Gizachew committed on
Commit
6a8b1de
1 Parent(s): 7f27ea0

End of training

Browse files
README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.3189
21
- - Accuracy: 0.9444
22
 
23
  ## Model description
24
 
@@ -52,17 +52,20 @@ The following hyperparameters were used during training:
52
 
53
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
54
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
55
- | 0.7722 | 1.0 | 223 | 0.4733 | 0.8434 |
56
- | 0.4755 | 2.0 | 446 | 0.4240 | 0.8687 |
57
- | 0.3262 | 3.0 | 669 | 0.2939 | 0.9343 |
58
- | 0.2642 | 4.0 | 892 | 0.3087 | 0.9293 |
59
- | 0.191 | 5.0 | 1115 | 0.3079 | 0.9394 |
60
- | 0.1534 | 6.0 | 1338 | 0.3134 | 0.9394 |
61
- | 0.1571 | 7.0 | 1561 | 0.4009 | 0.9293 |
62
- | 0.1328 | 8.0 | 1784 | 0.3189 | 0.9444 |
63
- | 0.1567 | 9.0 | 2007 | 0.4089 | 0.9192 |
64
- | 0.1043 | 10.0 | 2230 | 0.3429 | 0.9343 |
65
- | 0.1161 | 11.0 | 2453 | 0.3534 | 0.9394 |
 
 
 
66
 
67
 
68
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.2980
21
+ - Accuracy: 0.9545
22
 
23
  ## Model description
24
 
 
52
 
53
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
54
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
55
+ | 1.1628 | 1.0 | 223 | 0.7126 | 0.7727 |
56
+ | 0.6562 | 2.0 | 446 | 0.5069 | 0.8485 |
57
+ | 0.4199 | 3.0 | 669 | 0.3570 | 0.8990 |
58
+ | 0.325 | 4.0 | 892 | 0.2092 | 0.9394 |
59
+ | 0.2217 | 5.0 | 1115 | 0.2392 | 0.9444 |
60
+ | 0.1831 | 6.0 | 1338 | 0.2754 | 0.9293 |
61
+ | 0.1598 | 7.0 | 1561 | 0.3294 | 0.9343 |
62
+ | 0.1676 | 8.0 | 1784 | 0.2669 | 0.9495 |
63
+ | 0.1597 | 9.0 | 2007 | 0.3438 | 0.9293 |
64
+ | 0.1132 | 10.0 | 2230 | 0.3159 | 0.9444 |
65
+ | 0.1224 | 11.0 | 2453 | 0.2980 | 0.9545 |
66
+ | 0.095 | 12.0 | 2676 | 0.2970 | 0.9444 |
67
+ | 0.1087 | 13.0 | 2899 | 0.3449 | 0.9343 |
68
+ | 0.1254 | 14.0 | 3122 | 0.3198 | 0.9444 |
69
 
70
 
71
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 11.0,
3
- "eval_accuracy": 0.9444444179534912,
4
- "eval_loss": 0.3189202845096588,
5
- "eval_runtime": 9.0656,
6
- "eval_samples_per_second": 21.841,
7
- "eval_steps_per_second": 5.515,
8
- "total_flos": 6.273609670944864e+17,
9
- "train_loss": 0.2735439113728912,
10
- "train_runtime": 1317.5896,
11
  "train_samples": 1781,
12
- "train_samples_per_second": 20.276,
13
- "train_steps_per_second": 2.539
14
  }
 
1
  {
2
+ "epoch": 14.0,
3
+ "eval_accuracy": 0.9545454382896423,
4
+ "eval_loss": 0.29796990752220154,
5
+ "eval_runtime": 8.9823,
6
+ "eval_samples_per_second": 22.043,
7
+ "eval_steps_per_second": 5.566,
8
+ "total_flos": 7.9842219974856e+17,
9
+ "train_loss": 0.30023268478028214,
10
+ "train_runtime": 1742.0026,
11
  "train_samples": 1781,
12
+ "train_samples_per_second": 15.336,
13
+ "train_steps_per_second": 1.92
14
  }
config.json CHANGED
@@ -53,20 +53,20 @@
53
  "hidden_dropout_prob": 0.1,
54
  "hidden_size": 768,
55
  "id2label": {
56
- "0": "Neutral",
57
- "1": "Fearful",
58
- "2": "Happy",
59
- "3": "Sad",
60
- "4": "Angry"
61
  },
62
  "initializer_range": 0.02,
63
  "intermediate_size": 3072,
64
  "label2id": {
65
- "Neutral": 0,
66
- "Fearful": 1,
67
- "Happy": 2,
68
- "Sad": 3,
69
- "Angry": 4
70
  },
71
  "layer_norm_eps": 1e-05,
72
  "layerdrop": 0.1,
 
53
  "hidden_dropout_prob": 0.1,
54
  "hidden_size": 768,
55
  "id2label": {
56
+ "0": "01Neutral",
57
+ "1": "02Fearful",
58
+ "2": "03Happy",
59
+ "3": "04Sad",
60
+ "4": "05Angry"
61
  },
62
  "initializer_range": 0.02,
63
  "intermediate_size": 3072,
64
  "label2id": {
65
+ "01Neutral": 0,
66
+ "02Fearful": 1,
67
+ "03Happy": 2,
68
+ "04Sad": 3,
69
+ "05Angry": 4
70
  },
71
  "layer_norm_eps": 1e-05,
72
  "layerdrop": 0.1,
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 11.0,
3
- "eval_accuracy": 0.9444444179534912,
4
- "eval_loss": 0.3189202845096588,
5
- "eval_runtime": 9.0656,
6
- "eval_samples_per_second": 21.841,
7
- "eval_steps_per_second": 5.515
8
  }
 
1
  {
2
+ "epoch": 14.0,
3
+ "eval_accuracy": 0.9545454382896423,
4
+ "eval_loss": 0.29796990752220154,
5
+ "eval_runtime": 8.9823,
6
+ "eval_samples_per_second": 22.043,
7
+ "eval_steps_per_second": 5.566
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9ca5e862fb246d7eaa7bfe0560a4177c3c1b4da82ae4c7765071e7b5f8402b5
3
  size 379890236
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5823ad5dc453f63bc711ccd08c86437beecb7c407032319c5660cf83beb4b2fd
3
  size 379890236
runs/Apr26_13-16-15_60e8964599d3/events.out.tfevents.1714137383.60e8964599d3.35.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dc75d4da3ec31eec728b4ec220ce2370e52770367a3ff760027c9b8782c8e90
3
+ size 17226
runs/Apr26_13-16-15_60e8964599d3/events.out.tfevents.1714139135.60e8964599d3.35.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f9a7e227a5da52a6596d6b0f73bbcb80b399ac6e1ebdbe2c906f1682b73f3e8
3
+ size 734
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 11.0,
3
- "total_flos": 6.273609670944864e+17,
4
- "train_loss": 0.2735439113728912,
5
- "train_runtime": 1317.5896,
6
  "train_samples": 1781,
7
- "train_samples_per_second": 20.276,
8
- "train_steps_per_second": 2.539
9
  }
 
1
  {
2
+ "epoch": 14.0,
3
+ "total_flos": 7.9842219974856e+17,
4
+ "train_loss": 0.30023268478028214,
5
+ "train_runtime": 1742.0026,
6
  "train_samples": 1781,
7
+ "train_samples_per_second": 15.336,
8
+ "train_steps_per_second": 1.92
9
  }
trainer_state.json CHANGED
@@ -1,297 +1,373 @@
1
  {
2
- "best_metric": 0.9444444179534912,
3
- "best_model_checkpoint": "/kaggle/working/ckpts/checkpoint-1784",
4
- "epoch": 11.0,
5
  "eval_steps": 500,
6
- "global_step": 2453,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.45,
13
- "grad_norm": 3.8796565532684326,
14
  "learning_rate": 9.701046337817639e-06,
15
- "loss": 1.0979,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.9,
20
- "grad_norm": 2.6467082500457764,
21
- "learning_rate": 9.4050822122571e-06,
22
- "loss": 0.7722,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 1.0,
27
- "eval_accuracy": 0.8434343338012695,
28
- "eval_loss": 0.4733273386955261,
29
- "eval_runtime": 9.4201,
30
- "eval_samples_per_second": 21.019,
31
- "eval_steps_per_second": 5.308,
32
  "step": 223
33
  },
34
  {
35
  "epoch": 1.35,
36
- "grad_norm": 20.371318817138672,
37
  "learning_rate": 9.106128550074738e-06,
38
- "loss": 0.5871,
39
  "step": 300
40
  },
41
  {
42
  "epoch": 1.79,
43
- "grad_norm": 3.244633674621582,
44
  "learning_rate": 8.807174887892378e-06,
45
- "loss": 0.4755,
46
  "step": 400
47
  },
48
  {
49
  "epoch": 2.0,
50
- "eval_accuracy": 0.868686854839325,
51
- "eval_loss": 0.42396825551986694,
52
- "eval_runtime": 9.3161,
53
- "eval_samples_per_second": 21.254,
54
- "eval_steps_per_second": 5.367,
55
  "step": 446
56
  },
57
  {
58
  "epoch": 2.24,
59
- "grad_norm": 8.883748054504395,
60
  "learning_rate": 8.51121076233184e-06,
61
- "loss": 0.377,
62
  "step": 500
63
  },
64
  {
65
  "epoch": 2.69,
66
- "grad_norm": 1.6546649932861328,
67
- "learning_rate": 8.2152466367713e-06,
68
- "loss": 0.3262,
69
  "step": 600
70
  },
71
  {
72
  "epoch": 3.0,
73
- "eval_accuracy": 0.9343434572219849,
74
- "eval_loss": 0.2939111590385437,
75
- "eval_runtime": 9.4493,
76
- "eval_samples_per_second": 20.954,
77
- "eval_steps_per_second": 5.291,
78
  "step": 669
79
  },
80
  {
81
  "epoch": 3.14,
82
- "grad_norm": 7.946984767913818,
83
  "learning_rate": 7.916292974588939e-06,
84
- "loss": 0.2347,
85
  "step": 700
86
  },
87
  {
88
  "epoch": 3.59,
89
- "grad_norm": 9.960402488708496,
90
  "learning_rate": 7.617339312406578e-06,
91
- "loss": 0.2642,
92
  "step": 800
93
  },
94
  {
95
  "epoch": 4.0,
96
- "eval_accuracy": 0.9292929172515869,
97
- "eval_loss": 0.3087417781352997,
98
- "eval_runtime": 9.224,
99
- "eval_samples_per_second": 21.466,
100
- "eval_steps_per_second": 5.421,
101
  "step": 892
102
  },
103
  {
104
  "epoch": 4.04,
105
- "grad_norm": 87.66458129882812,
106
  "learning_rate": 7.318385650224216e-06,
107
- "loss": 0.2608,
108
  "step": 900
109
  },
110
  {
111
  "epoch": 4.48,
112
- "grad_norm": 42.06097412109375,
113
  "learning_rate": 7.019431988041854e-06,
114
- "loss": 0.213,
115
  "step": 1000
116
  },
117
  {
118
  "epoch": 4.93,
119
- "grad_norm": 21.227588653564453,
120
- "learning_rate": 6.723467862481315e-06,
121
- "loss": 0.191,
122
  "step": 1100
123
  },
124
  {
125
  "epoch": 5.0,
126
- "eval_accuracy": 0.939393937587738,
127
- "eval_loss": 0.30786794424057007,
128
- "eval_runtime": 9.2259,
129
- "eval_samples_per_second": 21.461,
130
- "eval_steps_per_second": 5.42,
131
  "step": 1115
132
  },
133
  {
134
  "epoch": 5.38,
135
- "grad_norm": 0.09492979198694229,
136
- "learning_rate": 6.424514200298954e-06,
137
- "loss": 0.1891,
138
  "step": 1200
139
  },
140
  {
141
  "epoch": 5.83,
142
- "grad_norm": 22.492895126342773,
143
- "learning_rate": 6.1255605381165925e-06,
144
- "loss": 0.1534,
145
  "step": 1300
146
  },
147
  {
148
  "epoch": 6.0,
149
- "eval_accuracy": 0.939393937587738,
150
- "eval_loss": 0.3133719265460968,
151
- "eval_runtime": 9.3193,
152
- "eval_samples_per_second": 21.246,
153
- "eval_steps_per_second": 5.365,
154
  "step": 1338
155
  },
156
  {
157
  "epoch": 6.28,
158
- "grad_norm": 0.05382364243268967,
159
- "learning_rate": 5.826606875934231e-06,
160
- "loss": 0.1825,
161
  "step": 1400
162
  },
163
  {
164
  "epoch": 6.73,
165
- "grad_norm": 5.18447732925415,
166
- "learning_rate": 5.527653213751869e-06,
167
- "loss": 0.1571,
168
  "step": 1500
169
  },
170
  {
171
  "epoch": 7.0,
172
- "eval_accuracy": 0.9292929172515869,
173
- "eval_loss": 0.40089717507362366,
174
- "eval_runtime": 9.2909,
175
- "eval_samples_per_second": 21.311,
176
- "eval_steps_per_second": 5.382,
177
  "step": 1561
178
  },
179
  {
180
  "epoch": 7.17,
181
- "grad_norm": 0.036003902554512024,
182
  "learning_rate": 5.228699551569507e-06,
183
- "loss": 0.1518,
184
  "step": 1600
185
  },
186
  {
187
  "epoch": 7.62,
188
- "grad_norm": 0.10409737378358841,
189
  "learning_rate": 4.929745889387145e-06,
190
- "loss": 0.1328,
191
  "step": 1700
192
  },
193
  {
194
  "epoch": 8.0,
195
- "eval_accuracy": 0.9444444179534912,
196
- "eval_loss": 0.3189202845096588,
197
- "eval_runtime": 9.3287,
198
- "eval_samples_per_second": 21.225,
199
- "eval_steps_per_second": 5.36,
200
  "step": 1784
201
  },
202
  {
203
  "epoch": 8.07,
204
- "grad_norm": 6.580456733703613,
205
  "learning_rate": 4.630792227204783e-06,
206
- "loss": 0.1127,
207
  "step": 1800
208
  },
209
  {
210
  "epoch": 8.52,
211
- "grad_norm": 0.12464825063943863,
212
  "learning_rate": 4.3318385650224224e-06,
213
- "loss": 0.1333,
214
  "step": 1900
215
  },
216
  {
217
  "epoch": 8.97,
218
- "grad_norm": 127.78559112548828,
219
  "learning_rate": 4.03288490284006e-06,
220
- "loss": 0.1567,
221
  "step": 2000
222
  },
223
  {
224
  "epoch": 9.0,
225
- "eval_accuracy": 0.9191918969154358,
226
- "eval_loss": 0.40891000628471375,
227
- "eval_runtime": 9.2849,
228
- "eval_samples_per_second": 21.325,
229
- "eval_steps_per_second": 5.385,
230
  "step": 2007
231
  },
232
  {
233
  "epoch": 9.42,
234
- "grad_norm": 0.03460687771439552,
235
  "learning_rate": 3.7339312406576984e-06,
236
- "loss": 0.1313,
237
  "step": 2100
238
  },
239
  {
240
  "epoch": 9.87,
241
- "grad_norm": 3.1638216972351074,
242
  "learning_rate": 3.4349775784753366e-06,
243
- "loss": 0.1043,
244
  "step": 2200
245
  },
246
  {
247
  "epoch": 10.0,
248
- "eval_accuracy": 0.9343434572219849,
249
- "eval_loss": 0.34286314249038696,
250
- "eval_runtime": 9.3365,
251
- "eval_samples_per_second": 21.207,
252
- "eval_steps_per_second": 5.355,
253
  "step": 2230
254
  },
255
  {
256
  "epoch": 10.31,
257
- "grad_norm": 0.462053507566452,
258
  "learning_rate": 3.136023916292975e-06,
259
- "loss": 0.1551,
260
  "step": 2300
261
  },
262
  {
263
  "epoch": 10.76,
264
- "grad_norm": 0.5621947050094604,
265
  "learning_rate": 2.8370702541106134e-06,
266
- "loss": 0.1161,
267
  "step": 2400
268
  },
269
  {
270
  "epoch": 11.0,
271
- "eval_accuracy": 0.939393937587738,
272
- "eval_loss": 0.3534471094608307,
273
- "eval_runtime": 9.2307,
274
- "eval_samples_per_second": 21.45,
275
- "eval_steps_per_second": 5.417,
276
  "step": 2453
277
  },
278
  {
279
- "epoch": 11.0,
280
- "step": 2453,
281
- "total_flos": 6.273609670944864e+17,
282
- "train_loss": 0.2735439113728912,
283
- "train_runtime": 1317.5896,
284
- "train_samples_per_second": 20.276,
285
- "train_steps_per_second": 2.539
286
  },
287
  {
288
- "epoch": 11.0,
 
 
 
 
 
 
 
289
  "eval_accuracy": 0.9444444179534912,
290
- "eval_loss": 0.3189202845096588,
291
- "eval_runtime": 9.0656,
292
- "eval_samples_per_second": 21.841,
293
- "eval_steps_per_second": 5.515,
294
- "step": 2453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  }
296
  ],
297
  "logging_steps": 100,
@@ -299,7 +375,7 @@
299
  "num_input_tokens_seen": 0,
300
  "num_train_epochs": 15,
301
  "save_steps": 500,
302
- "total_flos": 6.273609670944864e+17,
303
  "train_batch_size": 4,
304
  "trial_name": null,
305
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.9545454382896423,
3
+ "best_model_checkpoint": "/kaggle/working/ckpts/checkpoint-2453",
4
+ "epoch": 14.0,
5
  "eval_steps": 500,
6
+ "global_step": 3122,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.45,
13
+ "grad_norm": 2.3771965503692627,
14
  "learning_rate": 9.701046337817639e-06,
15
+ "loss": 1.5285,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.9,
20
+ "grad_norm": 2.858564615249634,
21
+ "learning_rate": 9.402092675635277e-06,
22
+ "loss": 1.1628,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 1.0,
27
+ "eval_accuracy": 0.7727272510528564,
28
+ "eval_loss": 0.712559700012207,
29
+ "eval_runtime": 9.2054,
30
+ "eval_samples_per_second": 21.509,
31
+ "eval_steps_per_second": 5.432,
32
  "step": 223
33
  },
34
  {
35
  "epoch": 1.35,
36
+ "grad_norm": 12.418754577636719,
37
  "learning_rate": 9.106128550074738e-06,
38
+ "loss": 0.8286,
39
  "step": 300
40
  },
41
  {
42
  "epoch": 1.79,
43
+ "grad_norm": 7.768007755279541,
44
  "learning_rate": 8.807174887892378e-06,
45
+ "loss": 0.6562,
46
  "step": 400
47
  },
48
  {
49
  "epoch": 2.0,
50
+ "eval_accuracy": 0.8484848737716675,
51
+ "eval_loss": 0.5068599581718445,
52
+ "eval_runtime": 9.5278,
53
+ "eval_samples_per_second": 20.781,
54
+ "eval_steps_per_second": 5.248,
55
  "step": 446
56
  },
57
  {
58
  "epoch": 2.24,
59
+ "grad_norm": 9.381482124328613,
60
  "learning_rate": 8.51121076233184e-06,
61
+ "loss": 0.5053,
62
  "step": 500
63
  },
64
  {
65
  "epoch": 2.69,
66
+ "grad_norm": 2.293752908706665,
67
+ "learning_rate": 8.212257100149478e-06,
68
+ "loss": 0.4199,
69
  "step": 600
70
  },
71
  {
72
  "epoch": 3.0,
73
+ "eval_accuracy": 0.8989899158477783,
74
+ "eval_loss": 0.356963574886322,
75
+ "eval_runtime": 9.2472,
76
+ "eval_samples_per_second": 21.412,
77
+ "eval_steps_per_second": 5.407,
78
  "step": 669
79
  },
80
  {
81
  "epoch": 3.14,
82
+ "grad_norm": 23.293209075927734,
83
  "learning_rate": 7.916292974588939e-06,
84
+ "loss": 0.3121,
85
  "step": 700
86
  },
87
  {
88
  "epoch": 3.59,
89
+ "grad_norm": 3.8754687309265137,
90
  "learning_rate": 7.617339312406578e-06,
91
+ "loss": 0.325,
92
  "step": 800
93
  },
94
  {
95
  "epoch": 4.0,
96
+ "eval_accuracy": 0.939393937587738,
97
+ "eval_loss": 0.20920716226100922,
98
+ "eval_runtime": 9.2568,
99
+ "eval_samples_per_second": 21.39,
100
+ "eval_steps_per_second": 5.401,
101
  "step": 892
102
  },
103
  {
104
  "epoch": 4.04,
105
+ "grad_norm": 62.81392288208008,
106
  "learning_rate": 7.318385650224216e-06,
107
+ "loss": 0.2896,
108
  "step": 900
109
  },
110
  {
111
  "epoch": 4.48,
112
+ "grad_norm": 35.08163833618164,
113
  "learning_rate": 7.019431988041854e-06,
114
+ "loss": 0.2535,
115
  "step": 1000
116
  },
117
  {
118
  "epoch": 4.93,
119
+ "grad_norm": 14.269490242004395,
120
+ "learning_rate": 6.720478325859492e-06,
121
+ "loss": 0.2217,
122
  "step": 1100
123
  },
124
  {
125
  "epoch": 5.0,
126
+ "eval_accuracy": 0.9444444179534912,
127
+ "eval_loss": 0.23924072086811066,
128
+ "eval_runtime": 9.2044,
129
+ "eval_samples_per_second": 21.511,
130
+ "eval_steps_per_second": 5.432,
131
  "step": 1115
132
  },
133
  {
134
  "epoch": 5.38,
135
+ "grad_norm": 0.41719043254852295,
136
+ "learning_rate": 6.421524663677131e-06,
137
+ "loss": 0.2165,
138
  "step": 1200
139
  },
140
  {
141
  "epoch": 5.83,
142
+ "grad_norm": 1.484471321105957,
143
+ "learning_rate": 6.1225710014947695e-06,
144
+ "loss": 0.1831,
145
  "step": 1300
146
  },
147
  {
148
  "epoch": 6.0,
149
+ "eval_accuracy": 0.9292929172515869,
150
+ "eval_loss": 0.27538299560546875,
151
+ "eval_runtime": 9.1435,
152
+ "eval_samples_per_second": 21.655,
153
+ "eval_steps_per_second": 5.468,
154
  "step": 1338
155
  },
156
  {
157
  "epoch": 6.28,
158
+ "grad_norm": 0.09743738174438477,
159
+ "learning_rate": 5.823617339312408e-06,
160
+ "loss": 0.2059,
161
  "step": 1400
162
  },
163
  {
164
  "epoch": 6.73,
165
+ "grad_norm": 0.3065042793750763,
166
+ "learning_rate": 5.524663677130046e-06,
167
+ "loss": 0.1598,
168
  "step": 1500
169
  },
170
  {
171
  "epoch": 7.0,
172
+ "eval_accuracy": 0.9343434572219849,
173
+ "eval_loss": 0.3294394910335541,
174
+ "eval_runtime": 9.1064,
175
+ "eval_samples_per_second": 21.743,
176
+ "eval_steps_per_second": 5.491,
177
  "step": 1561
178
  },
179
  {
180
  "epoch": 7.17,
181
+ "grad_norm": 0.05342373996973038,
182
  "learning_rate": 5.228699551569507e-06,
183
+ "loss": 0.1455,
184
  "step": 1600
185
  },
186
  {
187
  "epoch": 7.62,
188
+ "grad_norm": 1.5460679531097412,
189
  "learning_rate": 4.929745889387145e-06,
190
+ "loss": 0.1676,
191
  "step": 1700
192
  },
193
  {
194
  "epoch": 8.0,
195
+ "eval_accuracy": 0.9494949579238892,
196
+ "eval_loss": 0.2668905258178711,
197
+ "eval_runtime": 9.2118,
198
+ "eval_samples_per_second": 21.494,
199
+ "eval_steps_per_second": 5.428,
200
  "step": 1784
201
  },
202
  {
203
  "epoch": 8.07,
204
+ "grad_norm": 17.537992477416992,
205
  "learning_rate": 4.630792227204783e-06,
206
+ "loss": 0.1762,
207
  "step": 1800
208
  },
209
  {
210
  "epoch": 8.52,
211
+ "grad_norm": 0.20349286496639252,
212
  "learning_rate": 4.3318385650224224e-06,
213
+ "loss": 0.1566,
214
  "step": 1900
215
  },
216
  {
217
  "epoch": 8.97,
218
+ "grad_norm": 15.300110816955566,
219
  "learning_rate": 4.03288490284006e-06,
220
+ "loss": 0.1597,
221
  "step": 2000
222
  },
223
  {
224
  "epoch": 9.0,
225
+ "eval_accuracy": 0.9292929172515869,
226
+ "eval_loss": 0.34383586049079895,
227
+ "eval_runtime": 9.179,
228
+ "eval_samples_per_second": 21.571,
229
+ "eval_steps_per_second": 5.447,
230
  "step": 2007
231
  },
232
  {
233
  "epoch": 9.42,
234
+ "grad_norm": 0.4512959420681,
235
  "learning_rate": 3.7339312406576984e-06,
236
+ "loss": 0.1416,
237
  "step": 2100
238
  },
239
  {
240
  "epoch": 9.87,
241
+ "grad_norm": 0.7455862760543823,
242
  "learning_rate": 3.4349775784753366e-06,
243
+ "loss": 0.1132,
244
  "step": 2200
245
  },
246
  {
247
  "epoch": 10.0,
248
+ "eval_accuracy": 0.9444444179534912,
249
+ "eval_loss": 0.31586208939552307,
250
+ "eval_runtime": 9.1631,
251
+ "eval_samples_per_second": 21.608,
252
+ "eval_steps_per_second": 5.457,
253
  "step": 2230
254
  },
255
  {
256
  "epoch": 10.31,
257
+ "grad_norm": 0.25966259837150574,
258
  "learning_rate": 3.136023916292975e-06,
259
+ "loss": 0.1654,
260
  "step": 2300
261
  },
262
  {
263
  "epoch": 10.76,
264
+ "grad_norm": 0.45347365736961365,
265
  "learning_rate": 2.8370702541106134e-06,
266
+ "loss": 0.1224,
267
  "step": 2400
268
  },
269
  {
270
  "epoch": 11.0,
271
+ "eval_accuracy": 0.9545454382896423,
272
+ "eval_loss": 0.29796990752220154,
273
+ "eval_runtime": 9.1354,
274
+ "eval_samples_per_second": 21.674,
275
+ "eval_steps_per_second": 5.473,
276
  "step": 2453
277
  },
278
  {
279
+ "epoch": 11.21,
280
+ "grad_norm": 27.043094635009766,
281
+ "learning_rate": 2.538116591928251e-06,
282
+ "loss": 0.1021,
283
+ "step": 2500
 
 
284
  },
285
  {
286
+ "epoch": 11.66,
287
+ "grad_norm": 72.37726593017578,
288
+ "learning_rate": 2.2391629297458894e-06,
289
+ "loss": 0.095,
290
+ "step": 2600
291
+ },
292
+ {
293
+ "epoch": 12.0,
294
  "eval_accuracy": 0.9444444179534912,
295
+ "eval_loss": 0.2970119118690491,
296
+ "eval_runtime": 9.1388,
297
+ "eval_samples_per_second": 21.666,
298
+ "eval_steps_per_second": 5.471,
299
+ "step": 2676
300
+ },
301
+ {
302
+ "epoch": 12.11,
303
+ "grad_norm": 0.6068007946014404,
304
+ "learning_rate": 1.940209267563528e-06,
305
+ "loss": 0.1307,
306
+ "step": 2700
307
+ },
308
+ {
309
+ "epoch": 12.56,
310
+ "grad_norm": 4.567564964294434,
311
+ "learning_rate": 1.641255605381166e-06,
312
+ "loss": 0.1087,
313
+ "step": 2800
314
+ },
315
+ {
316
+ "epoch": 13.0,
317
+ "eval_accuracy": 0.9343434572219849,
318
+ "eval_loss": 0.34486597776412964,
319
+ "eval_runtime": 9.3094,
320
+ "eval_samples_per_second": 21.269,
321
+ "eval_steps_per_second": 5.371,
322
+ "step": 2899
323
+ },
324
+ {
325
+ "epoch": 13.0,
326
+ "grad_norm": 41.62958908081055,
327
+ "learning_rate": 1.3423019431988044e-06,
328
+ "loss": 0.0917,
329
+ "step": 2900
330
+ },
331
+ {
332
+ "epoch": 13.45,
333
+ "grad_norm": 0.026164406910538673,
334
+ "learning_rate": 1.0433482810164425e-06,
335
+ "loss": 0.0904,
336
+ "step": 3000
337
+ },
338
+ {
339
+ "epoch": 13.9,
340
+ "grad_norm": 52.47389221191406,
341
+ "learning_rate": 7.443946188340807e-07,
342
+ "loss": 0.1254,
343
+ "step": 3100
344
+ },
345
+ {
346
+ "epoch": 14.0,
347
+ "eval_accuracy": 0.9444444179534912,
348
+ "eval_loss": 0.31978654861450195,
349
+ "eval_runtime": 9.2595,
350
+ "eval_samples_per_second": 21.384,
351
+ "eval_steps_per_second": 5.4,
352
+ "step": 3122
353
+ },
354
+ {
355
+ "epoch": 14.0,
356
+ "step": 3122,
357
+ "total_flos": 7.9842219974856e+17,
358
+ "train_loss": 0.30023268478028214,
359
+ "train_runtime": 1742.0026,
360
+ "train_samples_per_second": 15.336,
361
+ "train_steps_per_second": 1.92
362
+ },
363
+ {
364
+ "epoch": 14.0,
365
+ "eval_accuracy": 0.9545454382896423,
366
+ "eval_loss": 0.29796990752220154,
367
+ "eval_runtime": 8.9823,
368
+ "eval_samples_per_second": 22.043,
369
+ "eval_steps_per_second": 5.566,
370
+ "step": 3122
371
  }
372
  ],
373
  "logging_steps": 100,
 
375
  "num_input_tokens_seen": 0,
376
  "num_train_epochs": 15,
377
  "save_steps": 500,
378
+ "total_flos": 7.9842219974856e+17,
379
  "train_batch_size": 4,
380
  "trial_name": null,
381
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de65037c614a22836253ad31eda6e76236b1e78bf69197af95578707a3ab6bbe
3
  size 4984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4f11d524efea23fe3851521cb1d052cadfa01563a005deccc39e6b7563628ad
3
  size 4984