yashcode00 commited on
Commit
4320ea0
·
1 Parent(s): 5d74d76

yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor

Browse files
README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor](https://huggingface.co/yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.3117
21
- - Accuracy: 0.9323
22
 
23
  ## Model description
24
 
@@ -38,35 +38,33 @@ More information needed
38
 
39
  The following hyperparameters were used during training:
40
  - learning_rate: 5e-05
41
- - train_batch_size: 8
42
  - eval_batch_size: 8
43
  - seed: 42
44
  - gradient_accumulation_steps: 8
45
- - total_train_batch_size: 64
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: linear
48
- - num_epochs: 60
49
 
50
  ### Training results
51
 
52
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
- |:-------------:|:-----:|:-----:|:---------------:|:--------:|
54
- | 0.0559 | 5.28 | 1000 | 0.3097 | 0.9191 |
55
- | 0.047 | 10.56 | 2000 | 0.3482 | 0.9191 |
56
- | 0.0402 | 15.84 | 3000 | 0.3890 | 0.9080 |
57
- | 0.0328 | 21.12 | 4000 | 0.3746 | 0.9150 |
58
- | 0.0189 | 26.4 | 5000 | 0.4274 | 0.9113 |
59
- | 0.0187 | 31.68 | 6000 | 0.4131 | 0.9101 |
60
- | 0.0203 | 36.96 | 7000 | 0.3643 | 0.9237 |
61
- | 0.0147 | 42.24 | 8000 | 0.3574 | 0.9295 |
62
- | 0.0148 | 47.52 | 9000 | 0.3653 | 0.9220 |
63
- | 0.0137 | 52.81 | 10000 | 0.3257 | 0.9352 |
64
- | 0.0174 | 58.09 | 11000 | 0.3097 | 0.9340 |
65
 
66
 
67
  ### Framework versions
68
 
69
- - Transformers 4.32.1
70
  - Pytorch 2.0.0
71
  - Datasets 2.11.0
72
  - Tokenizers 0.13.3
 
17
 
18
  This model is a fine-tuned version of [yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor](https://huggingface.co/yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.2965
21
+ - Accuracy: 0.9319
22
 
23
  ## Model description
24
 
 
38
 
39
  The following hyperparameters were used during training:
40
  - learning_rate: 5e-05
41
+ - train_batch_size: 16
42
  - eval_batch_size: 8
43
  - seed: 42
44
  - gradient_accumulation_steps: 8
45
+ - total_train_batch_size: 128
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: linear
48
+ - num_epochs: 100
49
 
50
  ### Training results
51
 
52
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
54
+ | 0.0272 | 10.55 | 1000 | 0.2915 | 0.9257 |
55
+ | 0.0172 | 21.11 | 2000 | 0.2769 | 0.9332 |
56
+ | 0.0178 | 31.66 | 3000 | 0.2928 | 0.9323 |
57
+ | 0.0145 | 42.22 | 4000 | 0.2718 | 0.9356 |
58
+ | 0.0147 | 52.77 | 5000 | 0.2676 | 0.9348 |
59
+ | 0.0135 | 63.32 | 6000 | 0.2731 | 0.9398 |
60
+ | 0.0105 | 73.88 | 7000 | 0.3145 | 0.9336 |
61
+ | 0.0075 | 84.43 | 8000 | 0.2971 | 0.9319 |
62
+ | 0.0078 | 94.99 | 9000 | 0.2950 | 0.9328 |
 
 
63
 
64
 
65
  ### Framework versions
66
 
67
+ - Transformers 4.33.0
68
  - Pytorch 2.0.0
69
  - Datasets 2.11.0
70
  - Tokenizers 0.13.3
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 59.88,
3
- "eval_accuracy": 0.9323432445526123,
4
- "eval_loss": 0.3117374777793884,
5
- "eval_runtime": 50.8983,
6
  "eval_samples": 2424,
7
- "eval_samples_per_second": 47.624,
8
- "eval_steps_per_second": 5.953,
9
- "total_flos": 2.20514636224512e+19,
10
- "train_loss": 0.030625741817122836,
11
- "train_runtime": 25244.9182,
12
  "train_samples": 12120,
13
- "train_samples_per_second": 28.806,
14
- "train_steps_per_second": 0.449
15
  }
 
1
  {
2
+ "epoch": 99.21,
3
+ "eval_accuracy": 0.9319307208061218,
4
+ "eval_loss": 0.29647526144981384,
5
+ "eval_runtime": 50.5435,
6
  "eval_samples": 2424,
7
+ "eval_samples_per_second": 47.959,
8
+ "eval_steps_per_second": 5.995,
9
+ "total_flos": 3.653391792237703e+19,
10
+ "train_loss": 0.01680715578667661,
11
+ "train_runtime": 36294.494,
12
  "train_samples": 12120,
13
+ "train_samples_per_second": 33.393,
14
+ "train_steps_per_second": 0.259
15
  }
config.json CHANGED
@@ -139,7 +139,7 @@
139
  1
140
  ],
141
  "torch_dtype": "float32",
142
- "transformers_version": "4.32.1",
143
  "use_weighted_layer_sum": false,
144
  "vocab_size": 32,
145
  "xvector_output_dim": 512
 
139
  1
140
  ],
141
  "torch_dtype": "float32",
142
+ "transformers_version": "4.33.0",
143
  "use_weighted_layer_sum": false,
144
  "vocab_size": 32,
145
  "xvector_output_dim": 512
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 59.88,
3
- "eval_accuracy": 0.9323432445526123,
4
- "eval_loss": 0.3117374777793884,
5
- "eval_runtime": 50.8983,
6
  "eval_samples": 2424,
7
- "eval_samples_per_second": 47.624,
8
- "eval_steps_per_second": 5.953
9
  }
 
1
  {
2
+ "epoch": 99.21,
3
+ "eval_accuracy": 0.9319307208061218,
4
+ "eval_loss": 0.29647526144981384,
5
+ "eval_runtime": 50.5435,
6
  "eval_samples": 2424,
7
+ "eval_samples_per_second": 47.959,
8
+ "eval_steps_per_second": 5.995
9
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41ca0eeca582f52ad187db0b010f7bf152237a85fc348fd2af50256d74874d7a
3
  size 1266146037
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:360b8fd02ab317b95f64242ee0a98ef6b12d72844557ec2da4385fe18fe39db6
3
  size 1266146037
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 59.88,
3
- "total_flos": 2.20514636224512e+19,
4
- "train_loss": 0.030625741817122836,
5
- "train_runtime": 25244.9182,
6
  "train_samples": 12120,
7
- "train_samples_per_second": 28.806,
8
- "train_steps_per_second": 0.449
9
  }
 
1
  {
2
+ "epoch": 99.21,
3
+ "total_flos": 3.653391792237703e+19,
4
+ "train_loss": 0.01680715578667661,
5
+ "train_runtime": 36294.494,
6
  "train_samples": 12120,
7
+ "train_samples_per_second": 33.393,
8
+ "train_steps_per_second": 0.259
9
  }
trainer_state.json CHANGED
@@ -1,814 +1,682 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 59.881188118811885,
5
  "eval_steps": 1000,
6
- "global_step": 11340,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.53,
13
- "learning_rate": 4.956349206349207e-05,
14
- "loss": 0.0798,
15
  "step": 100
16
  },
17
  {
18
- "epoch": 1.06,
19
- "learning_rate": 4.912257495590829e-05,
20
- "loss": 0.0816,
21
  "step": 200
22
  },
23
  {
24
- "epoch": 1.58,
25
- "learning_rate": 4.868165784832452e-05,
26
- "loss": 0.0761,
27
  "step": 300
28
  },
29
  {
30
- "epoch": 2.11,
31
- "learning_rate": 4.824514991181658e-05,
32
- "loss": 0.0723,
33
  "step": 400
34
  },
35
  {
36
- "epoch": 2.64,
37
- "learning_rate": 4.7804232804232806e-05,
38
- "loss": 0.0601,
39
  "step": 500
40
  },
41
  {
42
- "epoch": 3.17,
43
- "learning_rate": 4.736331569664903e-05,
44
- "loss": 0.0593,
45
  "step": 600
46
  },
47
  {
48
- "epoch": 3.7,
49
- "learning_rate": 4.692239858906526e-05,
50
- "loss": 0.0692,
51
  "step": 700
52
  },
53
  {
54
- "epoch": 4.22,
55
- "learning_rate": 4.648148148148148e-05,
56
- "loss": 0.0587,
57
  "step": 800
58
  },
59
  {
60
- "epoch": 4.75,
61
- "learning_rate": 4.604056437389771e-05,
62
- "loss": 0.0598,
63
  "step": 900
64
  },
65
  {
66
- "epoch": 5.28,
67
- "learning_rate": 4.559964726631393e-05,
68
- "loss": 0.0559,
69
  "step": 1000
70
  },
71
  {
72
- "epoch": 5.28,
73
- "eval_accuracy": 0.9191418886184692,
74
- "eval_loss": 0.3096904754638672,
75
- "eval_runtime": 51.3243,
76
- "eval_samples_per_second": 47.229,
77
- "eval_steps_per_second": 5.904,
78
  "step": 1000
79
  },
80
  {
81
- "epoch": 5.81,
82
- "learning_rate": 4.515873015873016e-05,
83
- "loss": 0.0552,
84
  "step": 1100
85
  },
86
  {
87
- "epoch": 6.34,
88
- "learning_rate": 4.471781305114639e-05,
89
- "loss": 0.0519,
90
  "step": 1200
91
  },
92
  {
93
- "epoch": 6.86,
94
- "learning_rate": 4.428130511463845e-05,
95
- "loss": 0.0536,
96
  "step": 1300
97
  },
98
  {
99
- "epoch": 7.39,
100
- "learning_rate": 4.3840388007054675e-05,
101
- "loss": 0.0438,
102
  "step": 1400
103
  },
104
  {
105
- "epoch": 7.92,
106
- "learning_rate": 4.33994708994709e-05,
107
- "loss": 0.0436,
108
  "step": 1500
109
  },
110
  {
111
- "epoch": 8.45,
112
- "learning_rate": 4.295855379188713e-05,
113
- "loss": 0.0447,
114
  "step": 1600
115
  },
116
  {
117
- "epoch": 8.98,
118
- "learning_rate": 4.2517636684303355e-05,
119
- "loss": 0.0535,
120
  "step": 1700
121
  },
122
  {
123
- "epoch": 9.5,
124
- "learning_rate": 4.207671957671958e-05,
125
- "loss": 0.0467,
126
  "step": 1800
127
  },
128
  {
129
- "epoch": 10.03,
130
- "learning_rate": 4.16358024691358e-05,
131
- "loss": 0.0557,
132
  "step": 1900
133
  },
134
  {
135
- "epoch": 10.56,
136
- "learning_rate": 4.1194885361552036e-05,
137
- "loss": 0.047,
138
  "step": 2000
139
  },
140
  {
141
- "epoch": 10.56,
142
- "eval_accuracy": 0.9191418886184692,
143
- "eval_loss": 0.34823155403137207,
144
- "eval_runtime": 51.4649,
145
- "eval_samples_per_second": 47.1,
146
- "eval_steps_per_second": 5.888,
147
  "step": 2000
148
  },
149
  {
150
- "epoch": 11.09,
151
- "learning_rate": 4.0753968253968256e-05,
152
- "loss": 0.0397,
153
  "step": 2100
154
  },
155
  {
156
- "epoch": 11.62,
157
- "learning_rate": 4.031305114638448e-05,
158
- "loss": 0.0475,
159
  "step": 2200
160
  },
161
  {
162
- "epoch": 12.15,
163
- "learning_rate": 3.987213403880071e-05,
164
- "loss": 0.0478,
165
  "step": 2300
166
  },
167
  {
168
- "epoch": 12.67,
169
- "learning_rate": 3.9431216931216936e-05,
170
- "loss": 0.0421,
171
  "step": 2400
172
  },
173
  {
174
- "epoch": 13.2,
175
- "learning_rate": 3.8994708994709e-05,
176
- "loss": 0.0408,
177
  "step": 2500
178
  },
179
  {
180
- "epoch": 13.73,
181
- "learning_rate": 3.8553791887125224e-05,
182
- "loss": 0.0368,
183
  "step": 2600
184
  },
185
  {
186
- "epoch": 14.26,
187
- "learning_rate": 3.8112874779541445e-05,
188
- "loss": 0.0379,
189
  "step": 2700
190
  },
191
  {
192
- "epoch": 14.79,
193
- "learning_rate": 3.767195767195768e-05,
194
- "loss": 0.052,
195
  "step": 2800
196
  },
197
  {
198
- "epoch": 15.31,
199
- "learning_rate": 3.72310405643739e-05,
200
- "loss": 0.0424,
201
  "step": 2900
202
  },
203
  {
204
- "epoch": 15.84,
205
- "learning_rate": 3.6790123456790125e-05,
206
- "loss": 0.0402,
207
  "step": 3000
208
  },
209
  {
210
- "epoch": 15.84,
211
- "eval_accuracy": 0.9080032706260681,
212
- "eval_loss": 0.3889801502227783,
213
- "eval_runtime": 50.7451,
214
- "eval_samples_per_second": 47.768,
215
- "eval_steps_per_second": 5.971,
216
  "step": 3000
217
  },
218
  {
219
- "epoch": 16.37,
220
- "learning_rate": 3.634920634920635e-05,
221
- "loss": 0.0417,
222
  "step": 3100
223
  },
224
  {
225
- "epoch": 16.9,
226
- "learning_rate": 3.590828924162258e-05,
227
- "loss": 0.0419,
228
  "step": 3200
229
  },
230
  {
231
- "epoch": 17.43,
232
- "learning_rate": 3.54673721340388e-05,
233
- "loss": 0.0439,
234
  "step": 3300
235
  },
236
  {
237
- "epoch": 17.95,
238
- "learning_rate": 3.502645502645503e-05,
239
- "loss": 0.0446,
240
  "step": 3400
241
  },
242
  {
243
- "epoch": 18.48,
244
- "learning_rate": 3.458553791887125e-05,
245
- "loss": 0.0339,
246
  "step": 3500
247
  },
248
  {
249
- "epoch": 19.01,
250
- "learning_rate": 3.414462081128748e-05,
251
- "loss": 0.0301,
252
  "step": 3600
253
  },
254
  {
255
- "epoch": 19.54,
256
- "learning_rate": 3.3703703703703706e-05,
257
- "loss": 0.0277,
258
  "step": 3700
259
  },
260
  {
261
- "epoch": 20.07,
262
- "learning_rate": 3.326278659611993e-05,
263
- "loss": 0.0333,
264
  "step": 3800
265
  },
266
  {
267
- "epoch": 20.59,
268
- "learning_rate": 3.282186948853615e-05,
269
- "loss": 0.0298,
270
  "step": 3900
271
  },
272
  {
273
- "epoch": 21.12,
274
- "learning_rate": 3.2380952380952386e-05,
275
- "loss": 0.0328,
276
  "step": 4000
277
  },
278
  {
279
- "epoch": 21.12,
280
- "eval_accuracy": 0.9150164723396301,
281
- "eval_loss": 0.37457939982414246,
282
- "eval_runtime": 50.3165,
283
- "eval_samples_per_second": 48.175,
284
- "eval_steps_per_second": 6.022,
285
  "step": 4000
286
  },
287
  {
288
- "epoch": 21.65,
289
- "learning_rate": 3.1940035273368606e-05,
290
- "loss": 0.0298,
291
  "step": 4100
292
  },
293
  {
294
- "epoch": 22.18,
295
- "learning_rate": 3.149911816578483e-05,
296
- "loss": 0.0314,
297
  "step": 4200
298
  },
299
  {
300
- "epoch": 22.71,
301
- "learning_rate": 3.105820105820106e-05,
302
- "loss": 0.0253,
303
  "step": 4300
304
  },
305
  {
306
- "epoch": 23.23,
307
- "learning_rate": 3.061728395061729e-05,
308
- "loss": 0.0339,
309
  "step": 4400
310
  },
311
  {
312
- "epoch": 23.76,
313
- "learning_rate": 3.017636684303351e-05,
314
- "loss": 0.0266,
315
  "step": 4500
316
  },
317
  {
318
- "epoch": 24.29,
319
- "learning_rate": 2.973544973544974e-05,
320
- "loss": 0.0361,
321
  "step": 4600
322
  },
323
  {
324
- "epoch": 24.82,
325
- "learning_rate": 2.929453262786596e-05,
326
- "loss": 0.0305,
327
  "step": 4700
328
  },
329
  {
330
- "epoch": 25.35,
331
- "learning_rate": 2.885361552028219e-05,
332
- "loss": 0.0294,
333
  "step": 4800
334
  },
335
  {
336
- "epoch": 25.87,
337
- "learning_rate": 2.8412698412698414e-05,
338
- "loss": 0.0339,
339
  "step": 4900
340
  },
341
  {
342
- "epoch": 26.4,
343
- "learning_rate": 2.797178130511464e-05,
344
- "loss": 0.0189,
345
  "step": 5000
346
  },
347
  {
348
- "epoch": 26.4,
349
- "eval_accuracy": 0.9113036394119263,
350
- "eval_loss": 0.42735978960990906,
351
- "eval_runtime": 49.4145,
352
- "eval_samples_per_second": 49.054,
353
- "eval_steps_per_second": 6.132,
354
  "step": 5000
355
  },
356
  {
357
- "epoch": 26.93,
358
- "learning_rate": 2.7530864197530864e-05,
359
- "loss": 0.0285,
360
  "step": 5100
361
  },
362
  {
363
- "epoch": 27.46,
364
- "learning_rate": 2.7089947089947094e-05,
365
- "loss": 0.0307,
366
  "step": 5200
367
  },
368
  {
369
- "epoch": 27.99,
370
- "learning_rate": 2.6649029982363318e-05,
371
- "loss": 0.0269,
372
  "step": 5300
373
  },
374
  {
375
- "epoch": 28.51,
376
- "learning_rate": 2.6208112874779544e-05,
377
- "loss": 0.0292,
378
  "step": 5400
379
  },
380
  {
381
- "epoch": 29.04,
382
- "learning_rate": 2.5767195767195768e-05,
383
- "loss": 0.032,
384
  "step": 5500
385
  },
386
  {
387
- "epoch": 29.57,
388
- "learning_rate": 2.5326278659611995e-05,
389
- "loss": 0.0297,
390
  "step": 5600
391
  },
392
  {
393
- "epoch": 30.1,
394
- "learning_rate": 2.4885361552028218e-05,
395
- "loss": 0.027,
396
  "step": 5700
397
  },
398
  {
399
- "epoch": 30.63,
400
- "learning_rate": 2.4444444444444445e-05,
401
- "loss": 0.0258,
402
  "step": 5800
403
  },
404
  {
405
- "epoch": 31.16,
406
- "learning_rate": 2.4003527336860672e-05,
407
- "loss": 0.0237,
408
  "step": 5900
409
  },
410
  {
411
- "epoch": 31.68,
412
- "learning_rate": 2.3562610229276895e-05,
413
- "loss": 0.0187,
414
  "step": 6000
415
  },
416
  {
417
- "epoch": 31.68,
418
- "eval_accuracy": 0.9100660085678101,
419
- "eval_loss": 0.4131234884262085,
420
- "eval_runtime": 49.5018,
421
- "eval_samples_per_second": 48.968,
422
- "eval_steps_per_second": 6.121,
423
  "step": 6000
424
  },
425
  {
426
- "epoch": 32.21,
427
- "learning_rate": 2.3121693121693122e-05,
428
- "loss": 0.0255,
429
  "step": 6100
430
  },
431
  {
432
- "epoch": 32.74,
433
- "learning_rate": 2.268077601410935e-05,
434
- "loss": 0.0244,
435
  "step": 6200
436
  },
437
  {
438
- "epoch": 33.27,
439
- "learning_rate": 2.2239858906525572e-05,
440
- "loss": 0.0273,
441
  "step": 6300
442
  },
443
  {
444
- "epoch": 33.8,
445
- "learning_rate": 2.17989417989418e-05,
446
- "loss": 0.0214,
447
  "step": 6400
448
  },
449
  {
450
- "epoch": 34.32,
451
- "learning_rate": 2.1358024691358026e-05,
452
- "loss": 0.0265,
453
  "step": 6500
454
  },
455
  {
456
- "epoch": 34.85,
457
- "learning_rate": 2.091710758377425e-05,
458
- "loss": 0.0276,
459
  "step": 6600
460
  },
461
  {
462
- "epoch": 35.38,
463
- "learning_rate": 2.0476190476190476e-05,
464
- "loss": 0.0153,
465
  "step": 6700
466
  },
467
  {
468
- "epoch": 35.91,
469
- "learning_rate": 2.0035273368606703e-05,
470
- "loss": 0.0246,
471
  "step": 6800
472
  },
473
  {
474
- "epoch": 36.44,
475
- "learning_rate": 1.959435626102293e-05,
476
- "loss": 0.0266,
477
  "step": 6900
478
  },
479
  {
480
- "epoch": 36.96,
481
- "learning_rate": 1.9153439153439153e-05,
482
- "loss": 0.0203,
483
  "step": 7000
484
  },
485
  {
486
- "epoch": 36.96,
487
- "eval_accuracy": 0.9236798882484436,
488
- "eval_loss": 0.3643423020839691,
489
- "eval_runtime": 49.3507,
490
- "eval_samples_per_second": 49.118,
491
- "eval_steps_per_second": 6.14,
492
  "step": 7000
493
  },
494
  {
495
- "epoch": 37.49,
496
- "learning_rate": 1.871252204585538e-05,
497
- "loss": 0.0225,
498
  "step": 7100
499
  },
500
  {
501
- "epoch": 38.02,
502
- "learning_rate": 1.8271604938271607e-05,
503
- "loss": 0.0296,
504
  "step": 7200
505
  },
506
  {
507
- "epoch": 38.55,
508
- "learning_rate": 1.783068783068783e-05,
509
- "loss": 0.0181,
510
  "step": 7300
511
  },
512
  {
513
- "epoch": 39.08,
514
- "learning_rate": 1.7389770723104057e-05,
515
- "loss": 0.0184,
516
  "step": 7400
517
  },
518
  {
519
- "epoch": 39.6,
520
- "learning_rate": 1.6948853615520284e-05,
521
- "loss": 0.0191,
522
  "step": 7500
523
  },
524
  {
525
- "epoch": 40.13,
526
- "learning_rate": 1.6507936507936507e-05,
527
- "loss": 0.0224,
528
  "step": 7600
529
  },
530
  {
531
- "epoch": 40.66,
532
- "learning_rate": 1.6067019400352734e-05,
533
- "loss": 0.0161,
534
  "step": 7700
535
  },
536
  {
537
- "epoch": 41.19,
538
- "learning_rate": 1.562610229276896e-05,
539
- "loss": 0.0211,
540
  "step": 7800
541
  },
542
  {
543
- "epoch": 41.72,
544
- "learning_rate": 1.5185185185185186e-05,
545
- "loss": 0.0165,
546
  "step": 7900
547
  },
548
  {
549
- "epoch": 42.24,
550
- "learning_rate": 1.4744268077601411e-05,
551
- "loss": 0.0147,
552
  "step": 8000
553
  },
554
  {
555
- "epoch": 42.24,
556
- "eval_accuracy": 0.9294554591178894,
557
- "eval_loss": 0.3574332892894745,
558
- "eval_runtime": 49.7962,
559
- "eval_samples_per_second": 48.678,
560
- "eval_steps_per_second": 6.085,
561
  "step": 8000
562
  },
563
  {
564
- "epoch": 42.77,
565
- "learning_rate": 1.4303350970017638e-05,
566
- "loss": 0.0235,
567
  "step": 8100
568
  },
569
  {
570
- "epoch": 43.3,
571
- "learning_rate": 1.3862433862433863e-05,
572
- "loss": 0.0207,
573
  "step": 8200
574
  },
575
  {
576
- "epoch": 43.83,
577
- "learning_rate": 1.3421516754850088e-05,
578
- "loss": 0.0139,
579
  "step": 8300
580
  },
581
  {
582
- "epoch": 44.36,
583
- "learning_rate": 1.2980599647266315e-05,
584
- "loss": 0.0168,
585
  "step": 8400
586
  },
587
  {
588
- "epoch": 44.88,
589
- "learning_rate": 1.253968253968254e-05,
590
- "loss": 0.0146,
591
  "step": 8500
592
  },
593
  {
594
- "epoch": 45.41,
595
- "learning_rate": 1.2098765432098767e-05,
596
- "loss": 0.0149,
597
  "step": 8600
598
  },
599
  {
600
- "epoch": 45.94,
601
- "learning_rate": 1.1657848324514992e-05,
602
- "loss": 0.0155,
603
  "step": 8700
604
  },
605
  {
606
- "epoch": 46.47,
607
- "learning_rate": 1.1221340388007055e-05,
608
- "loss": 0.0196,
609
  "step": 8800
610
  },
611
  {
612
- "epoch": 47.0,
613
- "learning_rate": 1.0780423280423282e-05,
614
- "loss": 0.0229,
615
  "step": 8900
616
  },
617
  {
618
- "epoch": 47.52,
619
- "learning_rate": 1.0339506172839507e-05,
620
- "loss": 0.0148,
621
  "step": 9000
622
  },
623
  {
624
- "epoch": 47.52,
625
- "eval_accuracy": 0.9220296740531921,
626
- "eval_loss": 0.36532989144325256,
627
- "eval_runtime": 50.0277,
628
- "eval_samples_per_second": 48.453,
629
- "eval_steps_per_second": 6.057,
630
  "step": 9000
631
  },
632
  {
633
- "epoch": 48.05,
634
- "learning_rate": 9.898589065255732e-06,
635
- "loss": 0.0133,
636
  "step": 9100
637
  },
638
  {
639
- "epoch": 48.58,
640
- "learning_rate": 9.457671957671959e-06,
641
- "loss": 0.0131,
642
  "step": 9200
643
  },
644
  {
645
- "epoch": 49.11,
646
- "learning_rate": 9.016754850088184e-06,
647
- "loss": 0.0121,
648
  "step": 9300
649
  },
650
  {
651
- "epoch": 49.64,
652
- "learning_rate": 8.575837742504409e-06,
653
- "loss": 0.0168,
654
  "step": 9400
655
  },
656
  {
657
- "epoch": 50.17,
658
- "learning_rate": 8.134920634920636e-06,
659
- "loss": 0.0148,
660
- "step": 9500
 
 
 
661
  },
662
  {
663
- "epoch": 50.69,
664
- "learning_rate": 7.694003527336861e-06,
665
- "loss": 0.0129,
666
- "step": 9600
667
- },
668
- {
669
- "epoch": 51.22,
670
- "learning_rate": 7.253086419753087e-06,
671
- "loss": 0.012,
672
- "step": 9700
673
- },
674
- {
675
- "epoch": 51.75,
676
- "learning_rate": 6.812169312169313e-06,
677
- "loss": 0.0112,
678
- "step": 9800
679
- },
680
- {
681
- "epoch": 52.28,
682
- "learning_rate": 6.371252204585539e-06,
683
- "loss": 0.0193,
684
- "step": 9900
685
- },
686
- {
687
- "epoch": 52.81,
688
- "learning_rate": 5.930335097001764e-06,
689
- "loss": 0.0137,
690
- "step": 10000
691
- },
692
- {
693
- "epoch": 52.81,
694
- "eval_accuracy": 0.9352310299873352,
695
- "eval_loss": 0.3257134258747101,
696
- "eval_runtime": 50.886,
697
- "eval_samples_per_second": 47.636,
698
- "eval_steps_per_second": 5.954,
699
- "step": 10000
700
- },
701
- {
702
- "epoch": 53.33,
703
- "learning_rate": 5.489417989417989e-06,
704
- "loss": 0.0171,
705
- "step": 10100
706
- },
707
- {
708
- "epoch": 53.86,
709
- "learning_rate": 5.048500881834215e-06,
710
- "loss": 0.0169,
711
- "step": 10200
712
- },
713
- {
714
- "epoch": 54.39,
715
- "learning_rate": 4.611992945326279e-06,
716
- "loss": 0.0136,
717
- "step": 10300
718
- },
719
- {
720
- "epoch": 54.92,
721
- "learning_rate": 4.171075837742505e-06,
722
- "loss": 0.0152,
723
- "step": 10400
724
- },
725
- {
726
- "epoch": 55.45,
727
- "learning_rate": 3.7301587301587305e-06,
728
- "loss": 0.015,
729
- "step": 10500
730
- },
731
- {
732
- "epoch": 55.97,
733
- "learning_rate": 3.289241622574956e-06,
734
- "loss": 0.0136,
735
- "step": 10600
736
- },
737
- {
738
- "epoch": 56.5,
739
- "learning_rate": 2.848324514991182e-06,
740
- "loss": 0.0156,
741
- "step": 10700
742
- },
743
- {
744
- "epoch": 57.03,
745
- "learning_rate": 2.4074074074074075e-06,
746
- "loss": 0.0137,
747
- "step": 10800
748
- },
749
- {
750
- "epoch": 57.56,
751
- "learning_rate": 1.9664902998236335e-06,
752
- "loss": 0.0147,
753
- "step": 10900
754
- },
755
- {
756
- "epoch": 58.09,
757
- "learning_rate": 1.525573192239859e-06,
758
- "loss": 0.0174,
759
- "step": 11000
760
- },
761
- {
762
- "epoch": 58.09,
763
- "eval_accuracy": 0.933993399143219,
764
- "eval_loss": 0.30968689918518066,
765
- "eval_runtime": 51.0931,
766
- "eval_samples_per_second": 47.443,
767
- "eval_steps_per_second": 5.93,
768
- "step": 11000
769
- },
770
- {
771
- "epoch": 58.61,
772
- "learning_rate": 1.0846560846560847e-06,
773
- "loss": 0.0163,
774
- "step": 11100
775
- },
776
- {
777
- "epoch": 59.14,
778
- "learning_rate": 6.437389770723105e-07,
779
- "loss": 0.0096,
780
- "step": 11200
781
- },
782
- {
783
- "epoch": 59.67,
784
- "learning_rate": 2.0282186948853617e-07,
785
- "loss": 0.0121,
786
- "step": 11300
787
- },
788
- {
789
- "epoch": 59.88,
790
- "step": 11340,
791
- "total_flos": 2.20514636224512e+19,
792
- "train_loss": 0.030625741817122836,
793
- "train_runtime": 25244.9182,
794
- "train_samples_per_second": 28.806,
795
- "train_steps_per_second": 0.449
796
- },
797
- {
798
- "epoch": 59.88,
799
- "eval_accuracy": 0.9323432445526123,
800
- "eval_loss": 0.3117374777793884,
801
- "eval_runtime": 50.8983,
802
- "eval_samples_per_second": 47.624,
803
- "eval_steps_per_second": 5.953,
804
- "step": 11340
805
  }
806
  ],
807
  "logging_steps": 100,
808
- "max_steps": 11340,
809
- "num_train_epochs": 60,
810
  "save_steps": 2000,
811
- "total_flos": 2.20514636224512e+19,
812
  "trial_name": null,
813
  "trial_params": null
814
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 99.2084432717678,
5
  "eval_steps": 1000,
6
+ "global_step": 9400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.06,
13
+ "learning_rate": 4.946808510638298e-05,
14
+ "loss": 0.0411,
15
  "step": 100
16
  },
17
  {
18
+ "epoch": 2.11,
19
+ "learning_rate": 4.893617021276596e-05,
20
+ "loss": 0.033,
21
  "step": 200
22
  },
23
  {
24
+ "epoch": 3.17,
25
+ "learning_rate": 4.840425531914894e-05,
26
+ "loss": 0.0286,
27
  "step": 300
28
  },
29
  {
30
+ "epoch": 4.22,
31
+ "learning_rate": 4.787234042553192e-05,
32
+ "loss": 0.0371,
33
  "step": 400
34
  },
35
  {
36
+ "epoch": 5.28,
37
+ "learning_rate": 4.734042553191489e-05,
38
+ "loss": 0.0315,
39
  "step": 500
40
  },
41
  {
42
+ "epoch": 6.33,
43
+ "learning_rate": 4.680851063829788e-05,
44
+ "loss": 0.0245,
45
  "step": 600
46
  },
47
  {
48
+ "epoch": 7.39,
49
+ "learning_rate": 4.627659574468085e-05,
50
+ "loss": 0.0301,
51
  "step": 700
52
  },
53
  {
54
+ "epoch": 8.44,
55
+ "learning_rate": 4.574468085106383e-05,
56
+ "loss": 0.0276,
57
  "step": 800
58
  },
59
  {
60
+ "epoch": 9.5,
61
+ "learning_rate": 4.5212765957446815e-05,
62
+ "loss": 0.027,
63
  "step": 900
64
  },
65
  {
66
+ "epoch": 10.55,
67
+ "learning_rate": 4.468085106382979e-05,
68
+ "loss": 0.0272,
69
  "step": 1000
70
  },
71
  {
72
+ "epoch": 10.55,
73
+ "eval_accuracy": 0.9257425665855408,
74
+ "eval_loss": 0.2915326654911041,
75
+ "eval_runtime": 50.3973,
76
+ "eval_samples_per_second": 48.098,
77
+ "eval_steps_per_second": 6.012,
78
  "step": 1000
79
  },
80
  {
81
+ "epoch": 11.61,
82
+ "learning_rate": 4.414893617021277e-05,
83
+ "loss": 0.0235,
84
  "step": 1100
85
  },
86
  {
87
+ "epoch": 12.66,
88
+ "learning_rate": 4.3617021276595746e-05,
89
+ "loss": 0.026,
90
  "step": 1200
91
  },
92
  {
93
+ "epoch": 13.72,
94
+ "learning_rate": 4.3085106382978725e-05,
95
+ "loss": 0.0274,
96
  "step": 1300
97
  },
98
  {
99
+ "epoch": 14.78,
100
+ "learning_rate": 4.2558510638297876e-05,
101
+ "loss": 0.0213,
102
  "step": 1400
103
  },
104
  {
105
+ "epoch": 15.83,
106
+ "learning_rate": 4.2026595744680855e-05,
107
+ "loss": 0.0235,
108
  "step": 1500
109
  },
110
  {
111
+ "epoch": 16.89,
112
+ "learning_rate": 4.1494680851063834e-05,
113
+ "loss": 0.0269,
114
  "step": 1600
115
  },
116
  {
117
+ "epoch": 17.94,
118
+ "learning_rate": 4.096276595744681e-05,
119
+ "loss": 0.0271,
120
  "step": 1700
121
  },
122
  {
123
+ "epoch": 19.0,
124
+ "learning_rate": 4.0436170212765964e-05,
125
+ "loss": 0.0222,
126
  "step": 1800
127
  },
128
  {
129
+ "epoch": 20.05,
130
+ "learning_rate": 3.990425531914894e-05,
131
+ "loss": 0.0213,
132
  "step": 1900
133
  },
134
  {
135
+ "epoch": 21.11,
136
+ "learning_rate": 3.9372340425531916e-05,
137
+ "loss": 0.0172,
138
  "step": 2000
139
  },
140
  {
141
+ "epoch": 21.11,
142
+ "eval_accuracy": 0.9331682920455933,
143
+ "eval_loss": 0.27688100934028625,
144
+ "eval_runtime": 51.2178,
145
+ "eval_samples_per_second": 47.327,
146
+ "eval_steps_per_second": 5.916,
147
  "step": 2000
148
  },
149
  {
150
+ "epoch": 22.16,
151
+ "learning_rate": 3.8840425531914895e-05,
152
+ "loss": 0.0252,
153
  "step": 2100
154
  },
155
  {
156
+ "epoch": 23.22,
157
+ "learning_rate": 3.8308510638297874e-05,
158
+ "loss": 0.0294,
159
  "step": 2200
160
  },
161
  {
162
+ "epoch": 24.27,
163
+ "learning_rate": 3.7776595744680853e-05,
164
+ "loss": 0.0249,
165
  "step": 2300
166
  },
167
  {
168
+ "epoch": 25.33,
169
+ "learning_rate": 3.7244680851063826e-05,
170
+ "loss": 0.0252,
171
  "step": 2400
172
  },
173
  {
174
+ "epoch": 26.39,
175
+ "learning_rate": 3.671276595744681e-05,
176
+ "loss": 0.0203,
177
  "step": 2500
178
  },
179
  {
180
+ "epoch": 27.44,
181
+ "learning_rate": 3.618085106382979e-05,
182
+ "loss": 0.0195,
183
  "step": 2600
184
  },
185
  {
186
+ "epoch": 28.5,
187
+ "learning_rate": 3.5648936170212764e-05,
188
+ "loss": 0.0209,
189
  "step": 2700
190
  },
191
  {
192
+ "epoch": 29.55,
193
+ "learning_rate": 3.511702127659575e-05,
194
+ "loss": 0.023,
195
  "step": 2800
196
  },
197
  {
198
+ "epoch": 30.61,
199
+ "learning_rate": 3.458510638297873e-05,
200
+ "loss": 0.0173,
201
  "step": 2900
202
  },
203
  {
204
+ "epoch": 31.66,
205
+ "learning_rate": 3.40531914893617e-05,
206
+ "loss": 0.0178,
207
  "step": 3000
208
  },
209
  {
210
+ "epoch": 31.66,
211
+ "eval_accuracy": 0.9323432445526123,
212
+ "eval_loss": 0.29276978969573975,
213
+ "eval_runtime": 51.5058,
214
+ "eval_samples_per_second": 47.063,
215
+ "eval_steps_per_second": 5.883,
216
  "step": 3000
217
  },
218
  {
219
+ "epoch": 32.72,
220
+ "learning_rate": 3.352127659574468e-05,
221
+ "loss": 0.0191,
222
  "step": 3100
223
  },
224
  {
225
+ "epoch": 33.77,
226
+ "learning_rate": 3.2989361702127666e-05,
227
+ "loss": 0.0226,
228
  "step": 3200
229
  },
230
  {
231
+ "epoch": 34.83,
232
+ "learning_rate": 3.245744680851064e-05,
233
+ "loss": 0.0197,
234
  "step": 3300
235
  },
236
  {
237
+ "epoch": 35.88,
238
+ "learning_rate": 3.192553191489362e-05,
239
+ "loss": 0.0246,
240
  "step": 3400
241
  },
242
  {
243
+ "epoch": 36.94,
244
+ "learning_rate": 3.13936170212766e-05,
245
+ "loss": 0.0183,
246
  "step": 3500
247
  },
248
  {
249
+ "epoch": 37.99,
250
+ "learning_rate": 3.0861702127659576e-05,
251
+ "loss": 0.0161,
252
  "step": 3600
253
  },
254
  {
255
+ "epoch": 39.05,
256
+ "learning_rate": 3.0329787234042556e-05,
257
+ "loss": 0.0156,
258
  "step": 3700
259
  },
260
  {
261
+ "epoch": 40.11,
262
+ "learning_rate": 2.979787234042553e-05,
263
+ "loss": 0.014,
264
  "step": 3800
265
  },
266
  {
267
+ "epoch": 41.16,
268
+ "learning_rate": 2.926595744680851e-05,
269
+ "loss": 0.0138,
270
  "step": 3900
271
  },
272
  {
273
+ "epoch": 42.22,
274
+ "learning_rate": 2.8734042553191493e-05,
275
+ "loss": 0.0145,
276
  "step": 4000
277
  },
278
  {
279
+ "epoch": 42.22,
280
+ "eval_accuracy": 0.9356435537338257,
281
+ "eval_loss": 0.27180883288383484,
282
+ "eval_runtime": 50.6123,
283
+ "eval_samples_per_second": 47.894,
284
+ "eval_steps_per_second": 5.987,
285
  "step": 4000
286
  },
287
  {
288
+ "epoch": 43.27,
289
+ "learning_rate": 2.820212765957447e-05,
290
+ "loss": 0.0176,
291
  "step": 4100
292
  },
293
  {
294
+ "epoch": 44.33,
295
+ "learning_rate": 2.7670212765957448e-05,
296
+ "loss": 0.0135,
297
  "step": 4200
298
  },
299
  {
300
+ "epoch": 45.38,
301
+ "learning_rate": 2.713829787234043e-05,
302
+ "loss": 0.0179,
303
  "step": 4300
304
  },
305
  {
306
+ "epoch": 46.44,
307
+ "learning_rate": 2.6606382978723403e-05,
308
+ "loss": 0.0164,
309
  "step": 4400
310
  },
311
  {
312
+ "epoch": 47.49,
313
+ "learning_rate": 2.6074468085106386e-05,
314
+ "loss": 0.019,
315
  "step": 4500
316
  },
317
  {
318
+ "epoch": 48.55,
319
+ "learning_rate": 2.5542553191489365e-05,
320
+ "loss": 0.0136,
321
  "step": 4600
322
  },
323
  {
324
+ "epoch": 49.6,
325
+ "learning_rate": 2.501063829787234e-05,
326
+ "loss": 0.0159,
327
  "step": 4700
328
  },
329
  {
330
+ "epoch": 50.66,
331
+ "learning_rate": 2.447872340425532e-05,
332
+ "loss": 0.0163,
333
  "step": 4800
334
  },
335
  {
336
+ "epoch": 51.72,
337
+ "learning_rate": 2.39468085106383e-05,
338
+ "loss": 0.0148,
339
  "step": 4900
340
  },
341
  {
342
+ "epoch": 52.77,
343
+ "learning_rate": 2.341489361702128e-05,
344
+ "loss": 0.0147,
345
  "step": 5000
346
  },
347
  {
348
+ "epoch": 52.77,
349
+ "eval_accuracy": 0.9348185062408447,
350
+ "eval_loss": 0.26764675974845886,
351
+ "eval_runtime": 49.7336,
352
+ "eval_samples_per_second": 48.74,
353
+ "eval_steps_per_second": 6.092,
354
  "step": 5000
355
  },
356
  {
357
+ "epoch": 53.83,
358
+ "learning_rate": 2.2882978723404254e-05,
359
+ "loss": 0.0127,
360
  "step": 5100
361
  },
362
  {
363
+ "epoch": 54.88,
364
+ "learning_rate": 2.2351063829787237e-05,
365
+ "loss": 0.014,
366
  "step": 5200
367
  },
368
  {
369
+ "epoch": 55.94,
370
+ "learning_rate": 2.1819148936170213e-05,
371
+ "loss": 0.0142,
372
  "step": 5300
373
  },
374
  {
375
+ "epoch": 56.99,
376
+ "learning_rate": 2.1287234042553192e-05,
377
+ "loss": 0.016,
378
  "step": 5400
379
  },
380
  {
381
+ "epoch": 58.05,
382
+ "learning_rate": 2.075531914893617e-05,
383
+ "loss": 0.0148,
384
  "step": 5500
385
  },
386
  {
387
+ "epoch": 59.1,
388
+ "learning_rate": 2.022340425531915e-05,
389
+ "loss": 0.0125,
390
  "step": 5600
391
  },
392
  {
393
+ "epoch": 60.16,
394
+ "learning_rate": 1.969148936170213e-05,
395
+ "loss": 0.0123,
396
  "step": 5700
397
  },
398
  {
399
+ "epoch": 61.21,
400
+ "learning_rate": 1.9159574468085105e-05,
401
+ "loss": 0.0153,
402
  "step": 5800
403
  },
404
  {
405
+ "epoch": 62.27,
406
+ "learning_rate": 1.8627659574468088e-05,
407
+ "loss": 0.0105,
408
  "step": 5900
409
  },
410
  {
411
+ "epoch": 63.32,
412
+ "learning_rate": 1.8095744680851064e-05,
413
+ "loss": 0.0135,
414
  "step": 6000
415
  },
416
  {
417
+ "epoch": 63.32,
418
+ "eval_accuracy": 0.9397689700126648,
419
+ "eval_loss": 0.27310383319854736,
420
+ "eval_runtime": 49.9787,
421
+ "eval_samples_per_second": 48.501,
422
+ "eval_steps_per_second": 6.063,
423
  "step": 6000
424
  },
425
  {
426
+ "epoch": 64.38,
427
+ "learning_rate": 1.7563829787234043e-05,
428
+ "loss": 0.0126,
429
  "step": 6100
430
  },
431
  {
432
+ "epoch": 65.44,
433
+ "learning_rate": 1.7031914893617022e-05,
434
+ "loss": 0.0141,
435
  "step": 6200
436
  },
437
  {
438
+ "epoch": 66.49,
439
+ "learning_rate": 1.6505319148936173e-05,
440
+ "loss": 0.0093,
441
  "step": 6300
442
  },
443
  {
444
+ "epoch": 67.55,
445
+ "learning_rate": 1.597340425531915e-05,
446
+ "loss": 0.0109,
447
  "step": 6400
448
  },
449
  {
450
+ "epoch": 68.6,
451
+ "learning_rate": 1.5441489361702128e-05,
452
+ "loss": 0.011,
453
  "step": 6500
454
  },
455
  {
456
+ "epoch": 69.66,
457
+ "learning_rate": 1.4909574468085109e-05,
458
+ "loss": 0.0127,
459
  "step": 6600
460
  },
461
  {
462
+ "epoch": 70.71,
463
+ "learning_rate": 1.4377659574468086e-05,
464
+ "loss": 0.009,
465
  "step": 6700
466
  },
467
  {
468
+ "epoch": 71.77,
469
+ "learning_rate": 1.3845744680851064e-05,
470
+ "loss": 0.0109,
471
  "step": 6800
472
  },
473
  {
474
+ "epoch": 72.82,
475
+ "learning_rate": 1.3313829787234045e-05,
476
+ "loss": 0.0114,
477
  "step": 6900
478
  },
479
  {
480
+ "epoch": 73.88,
481
+ "learning_rate": 1.2781914893617022e-05,
482
+ "loss": 0.0105,
483
  "step": 7000
484
  },
485
  {
486
+ "epoch": 73.88,
487
+ "eval_accuracy": 0.9335808753967285,
488
+ "eval_loss": 0.314525306224823,
489
+ "eval_runtime": 50.1755,
490
+ "eval_samples_per_second": 48.31,
491
+ "eval_steps_per_second": 6.039,
492
  "step": 7000
493
  },
494
  {
495
+ "epoch": 74.93,
496
+ "learning_rate": 1.225e-05,
497
+ "loss": 0.0124,
498
  "step": 7100
499
  },
500
  {
501
+ "epoch": 75.99,
502
+ "learning_rate": 1.171808510638298e-05,
503
+ "loss": 0.0091,
504
  "step": 7200
505
  },
506
  {
507
+ "epoch": 77.04,
508
+ "learning_rate": 1.1186170212765958e-05,
509
+ "loss": 0.0118,
510
  "step": 7300
511
  },
512
  {
513
+ "epoch": 78.1,
514
+ "learning_rate": 1.0654255319148937e-05,
515
+ "loss": 0.0098,
516
  "step": 7400
517
  },
518
  {
519
+ "epoch": 79.16,
520
+ "learning_rate": 1.0122340425531915e-05,
521
+ "loss": 0.01,
522
  "step": 7500
523
  },
524
  {
525
+ "epoch": 80.21,
526
+ "learning_rate": 9.590425531914894e-06,
527
+ "loss": 0.0103,
528
  "step": 7600
529
  },
530
  {
531
+ "epoch": 81.27,
532
+ "learning_rate": 9.058510638297871e-06,
533
+ "loss": 0.0071,
534
  "step": 7700
535
  },
536
  {
537
+ "epoch": 82.32,
538
+ "learning_rate": 8.52659574468085e-06,
539
+ "loss": 0.0114,
540
  "step": 7800
541
  },
542
  {
543
+ "epoch": 83.38,
544
+ "learning_rate": 7.994680851063832e-06,
545
+ "loss": 0.0094,
546
  "step": 7900
547
  },
548
  {
549
+ "epoch": 84.43,
550
+ "learning_rate": 7.462765957446809e-06,
551
+ "loss": 0.0075,
552
  "step": 8000
553
  },
554
  {
555
+ "epoch": 84.43,
556
+ "eval_accuracy": 0.9319307208061218,
557
+ "eval_loss": 0.29711028933525085,
558
+ "eval_runtime": 50.3531,
559
+ "eval_samples_per_second": 48.14,
560
+ "eval_steps_per_second": 6.018,
561
  "step": 8000
562
  },
563
  {
564
+ "epoch": 85.49,
565
+ "learning_rate": 6.930851063829788e-06,
566
+ "loss": 0.0089,
567
  "step": 8100
568
  },
569
  {
570
+ "epoch": 86.54,
571
+ "learning_rate": 6.398936170212766e-06,
572
+ "loss": 0.0096,
573
  "step": 8200
574
  },
575
  {
576
+ "epoch": 87.6,
577
+ "learning_rate": 5.867021276595745e-06,
578
+ "loss": 0.0095,
579
  "step": 8300
580
  },
581
  {
582
+ "epoch": 88.65,
583
+ "learning_rate": 5.335106382978724e-06,
584
+ "loss": 0.0143,
585
  "step": 8400
586
  },
587
  {
588
+ "epoch": 89.71,
589
+ "learning_rate": 4.8031914893617025e-06,
590
+ "loss": 0.0097,
591
  "step": 8500
592
  },
593
  {
594
+ "epoch": 90.77,
595
+ "learning_rate": 4.271276595744681e-06,
596
+ "loss": 0.0094,
597
  "step": 8600
598
  },
599
  {
600
+ "epoch": 91.82,
601
+ "learning_rate": 3.7393617021276596e-06,
602
+ "loss": 0.0088,
603
  "step": 8700
604
  },
605
  {
606
+ "epoch": 92.88,
607
+ "learning_rate": 3.2074468085106384e-06,
608
+ "loss": 0.0052,
609
  "step": 8800
610
  },
611
  {
612
+ "epoch": 93.93,
613
+ "learning_rate": 2.6755319148936168e-06,
614
+ "loss": 0.0102,
615
  "step": 8900
616
  },
617
  {
618
+ "epoch": 94.99,
619
+ "learning_rate": 2.143617021276596e-06,
620
+ "loss": 0.0078,
621
  "step": 9000
622
  },
623
  {
624
+ "epoch": 94.99,
625
+ "eval_accuracy": 0.9327557682991028,
626
+ "eval_loss": 0.2949957847595215,
627
+ "eval_runtime": 51.5836,
628
+ "eval_samples_per_second": 46.992,
629
+ "eval_steps_per_second": 5.874,
630
  "step": 9000
631
  },
632
  {
633
+ "epoch": 96.04,
634
+ "learning_rate": 1.6117021276595745e-06,
635
+ "loss": 0.0066,
636
  "step": 9100
637
  },
638
  {
639
+ "epoch": 97.1,
640
+ "learning_rate": 1.0797872340425531e-06,
641
+ "loss": 0.0081,
642
  "step": 9200
643
  },
644
  {
645
+ "epoch": 98.15,
646
+ "learning_rate": 5.478723404255319e-07,
647
+ "loss": 0.0094,
648
  "step": 9300
649
  },
650
  {
651
+ "epoch": 99.21,
652
+ "learning_rate": 1.595744680851064e-08,
653
+ "loss": 0.0095,
654
  "step": 9400
655
  },
656
  {
657
+ "epoch": 99.21,
658
+ "step": 9400,
659
+ "total_flos": 3.653391792237703e+19,
660
+ "train_loss": 0.01680715578667661,
661
+ "train_runtime": 36294.494,
662
+ "train_samples_per_second": 33.393,
663
+ "train_steps_per_second": 0.259
664
  },
665
  {
666
+ "epoch": 99.21,
667
+ "eval_accuracy": 0.9319307208061218,
668
+ "eval_loss": 0.29647526144981384,
669
+ "eval_runtime": 50.5435,
670
+ "eval_samples_per_second": 47.959,
671
+ "eval_steps_per_second": 5.995,
672
+ "step": 9400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
673
  }
674
  ],
675
  "logging_steps": 100,
676
+ "max_steps": 9400,
677
+ "num_train_epochs": 100,
678
  "save_steps": 2000,
679
+ "total_flos": 3.653391792237703e+19,
680
  "trial_name": null,
681
  "trial_params": null
682
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34a256b8a5d2d883823dcb395af53cbfffa6c0546324fd46a6cbd61fd66d4518
3
  size 4155
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26c98e7aa7407b828227bd572ea7475f0910a407b91381f0d71f441503edc41e
3
  size 4155