arampacha commited on
Commit
9ff1856
1 Parent(s): f3f904f
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 64.51,
3
- "eval_cer": 0.034204238520673176,
4
- "eval_loss": 0.12852737307548523,
5
- "eval_runtime": 199.2512,
6
  "eval_samples": 5802,
7
- "eval_samples_per_second": 29.119,
8
- "eval_steps_per_second": 0.457,
9
- "eval_wer": 0.18207560526688377,
10
- "train_loss": 0.5247637950897217,
11
- "train_runtime": 62085.4852,
12
- "train_samples": 19948,
13
- "train_samples_per_second": 20.617,
14
- "train_steps_per_second": 0.161
15
  }
 
1
  {
2
+ "epoch": 38.59,
3
+ "eval_cer": 0.032260099136267845,
4
+ "eval_loss": 0.10924588888883591,
5
+ "eval_runtime": 200.0507,
6
  "eval_samples": 5802,
7
+ "eval_samples_per_second": 29.003,
8
+ "eval_steps_per_second": 0.455,
9
+ "eval_wer": 0.17520883477275945,
10
+ "train_loss": 1.0786900800069172,
11
+ "train_runtime": 133237.4383,
12
+ "train_samples": 39803,
13
+ "train_samples_per_second": 11.528,
14
+ "train_steps_per_second": 0.09
15
  }
config.json CHANGED
@@ -64,7 +64,7 @@
64
  "mask_feature_prob": 0.25,
65
  "mask_time_length": 10,
66
  "mask_time_min_masks": 2,
67
- "mask_time_prob": 0.55,
68
  "model_type": "wav2vec2",
69
  "num_adapter_layers": 3,
70
  "num_attention_heads": 16,
 
64
  "mask_feature_prob": 0.25,
65
  "mask_time_length": 10,
66
  "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.7,
68
  "model_type": "wav2vec2",
69
  "num_adapter_layers": 3,
70
  "num_attention_heads": 16,
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 64.51,
3
- "eval_cer": 0.034204238520673176,
4
- "eval_loss": 0.12852737307548523,
5
- "eval_runtime": 199.2512,
6
  "eval_samples": 5802,
7
- "eval_samples_per_second": 29.119,
8
- "eval_steps_per_second": 0.457,
9
- "eval_wer": 0.18207560526688377
10
  }
 
1
  {
2
+ "epoch": 38.59,
3
+ "eval_cer": 0.032260099136267845,
4
+ "eval_loss": 0.10924588888883591,
5
+ "eval_runtime": 200.0507,
6
  "eval_samples": 5802,
7
+ "eval_samples_per_second": 29.003,
8
+ "eval_steps_per_second": 0.455,
9
+ "eval_wer": 0.17520883477275945
10
  }
log_mozilla-foundation_common_voice_8_0_uk_test_predictions.txt CHANGED
The diff for this file is too large to render. See raw diff
 
mozilla-foundation_common_voice_8_0_uk_test_eval_results.txt CHANGED
@@ -1,2 +1,2 @@
1
- WER: 0.11251120864599556
2
- CER: 0.02284734795642753
 
1
+ WER: 0.10406342913776016
2
+ CER: 0.020387492208601702
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7ce482acb12c018f40d2be671022e2737f85e54bfc9edacc3c203ce66819ee3
3
  size 3850512561
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0899f4d43292dcdf2428937a08f0be20d919e37589599d1f33fbd667896e6296
3
  size 3850512561
run.sh CHANGED
@@ -1,9 +1,10 @@
1
  python run_speech_recognition_ctc.py \
2
- --dataset_name /workspace/data/uk/noizy_student_1/ \
3
  --train_split_name train \
4
  --model_name_or_path="facebook/wav2vec2-xls-r-1b" \
5
- --output_dir="./" \
6
- --max_steps 10000 \
 
7
  --per_device_train_batch_size="16" \
8
  --per_device_eval_batch_size="64" \
9
  --gradient_accumulation_steps="8" \
@@ -18,12 +19,12 @@ python run_speech_recognition_ctc.py \
18
  --save_steps="500" \
19
  --eval_steps="500" \
20
  --logging_steps="100" \
21
- --save_total_limit 5 \
22
  --freeze_feature_encoder \
23
  --layerdrop="0.1" \
24
  --activation_dropout="0.1" \
25
  --feat_proj_dropout="0.0" \
26
- --mask_time_prob="0.55" \
27
  --mask_time_length="10" \
28
  --mask_feature_prob="0.25" \
29
  --mask_feature_length="64" \
@@ -34,6 +35,6 @@ python run_speech_recognition_ctc.py \
34
  --do_train --do_eval \
35
  --load_best_model_at_end \
36
  --report_to all \
37
- --run_name="xlsr-uk-ns-1b-1" \
38
  --wandb_project="xlsr-uk" \
39
  --bnb --tristage_sched
 
1
  python run_speech_recognition_ctc.py \
2
+ --dataset_name /workspace/data/uk/composed_dataset/ \
3
  --train_split_name train \
4
  --model_name_or_path="facebook/wav2vec2-xls-r-1b" \
5
+ --output_dir ./ \
6
+ --overwrite_output_dir \
7
+ --max_steps 12000 \
8
  --per_device_train_batch_size="16" \
9
  --per_device_eval_batch_size="64" \
10
  --gradient_accumulation_steps="8" \
 
19
  --save_steps="500" \
20
  --eval_steps="500" \
21
  --logging_steps="100" \
22
+ --save_total_limit 10 \
23
  --freeze_feature_encoder \
24
  --layerdrop="0.1" \
25
  --activation_dropout="0.1" \
26
  --feat_proj_dropout="0.0" \
27
+ --mask_time_prob="0.7" \
28
  --mask_time_length="10" \
29
  --mask_feature_prob="0.25" \
30
  --mask_feature_length="64" \
 
35
  --do_train --do_eval \
36
  --load_best_model_at_end \
37
  --report_to all \
38
+ --run_name="xlsr-uk-1b-1" \
39
  --wandb_project="xlsr-uk" \
40
  --bnb --tristage_sched
run_speech_recognition_ctc.py CHANGED
@@ -438,7 +438,7 @@ def main():
438
  raw_datasets = DatasetDict()
439
 
440
  if training_args.do_train:
441
- if data_args.dataset_name.endswith("/"):
442
  raw_datasets["train"] = load_from_disk(f"{data_args.dataset_name}/{data_args.train_split_name}")
443
  else:
444
  raw_datasets["train"] = load_dataset(
@@ -466,7 +466,7 @@ def main():
466
  raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
467
 
468
  if training_args.do_eval:
469
- if data_args.dataset_name.endswith("/"):
470
  raw_datasets["eval"] = load_from_disk(f"{data_args.dataset_name}/{data_args.eval_split_name}")
471
  else:
472
  raw_datasets["eval"] = load_dataset(
@@ -744,7 +744,7 @@ def main():
744
  eps=training_args.adam_epsilon,
745
  )
746
  if extra_args.tristage_sched:
747
- scheduler = get_tri_stage_schedule(optimizer, training_args.max_steps)
748
  else:
749
  scheduler = None
750
  optimizers = (optimizer, scheduler)
 
438
  raw_datasets = DatasetDict()
439
 
440
  if training_args.do_train:
441
+ if os.path.isdir(data_args.dataset_name):
442
  raw_datasets["train"] = load_from_disk(f"{data_args.dataset_name}/{data_args.train_split_name}")
443
  else:
444
  raw_datasets["train"] = load_dataset(
 
466
  raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
467
 
468
  if training_args.do_eval:
469
+ if os.path.isdir(data_args.dataset_name):
470
  raw_datasets["eval"] = load_from_disk(f"{data_args.dataset_name}/{data_args.eval_split_name}")
471
  else:
472
  raw_datasets["eval"] = load_dataset(
 
744
  eps=training_args.adam_epsilon,
745
  )
746
  if extra_args.tristage_sched:
747
+ scheduler = get_tri_stage_schedule(optimizer, training_args.max_steps, ratios=[0.1,0.3,0.6])
748
  else:
749
  scheduler = None
750
  optimizers = (optimizer, scheduler)
runs/Feb05_17-03-38_job-680ae191-b2c7-4b97-adaf-cb186b6c96a6/1644080998.635567/events.out.tfevents.1644080998.job-680ae191-b2c7-4b97-adaf-cb186b6c96a6.1687777.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14269093cd8a901cf553b343358bb3a6e11d7e016a5e3089a4fbef9e16f15342
3
+ size 4769
runs/Feb05_17-03-38_job-680ae191-b2c7-4b97-adaf-cb186b6c96a6/events.out.tfevents.1644080998.job-680ae191-b2c7-4b97-adaf-cb186b6c96a6.1687777.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5aebc5b8ec89626c05af71d1ad757ef5ae42fef6c33928134292765606f21235
3
+ size 32651
runs/Feb05_17-03-38_job-680ae191-b2c7-4b97-adaf-cb186b6c96a6/events.out.tfevents.1644214441.job-680ae191-b2c7-4b97-adaf-cb186b6c96a6.1687777.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0e5e8b6387cfca950c01f1519bcfc4e021fb5b5c49e87204947a35d11ba276b
3
+ size 405
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
speech-recognition-community-v2_dev_data_uk_validation_eval_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.39531368102796677
2
+ CER: 0.20685439144902693
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 64.51,
3
- "train_loss": 0.5247637950897217,
4
- "train_runtime": 62085.4852,
5
- "train_samples": 19948,
6
- "train_samples_per_second": 20.617,
7
- "train_steps_per_second": 0.161
8
  }
 
1
  {
2
+ "epoch": 38.59,
3
+ "train_loss": 1.0786900800069172,
4
+ "train_runtime": 133237.4383,
5
+ "train_samples": 39803,
6
+ "train_samples_per_second": 11.528,
7
+ "train_steps_per_second": 0.09
8
  }
trainer_state.json CHANGED
@@ -1,825 +1,985 @@
1
  {
2
- "best_metric": 0.12852737307548523,
3
- "best_model_checkpoint": "./checkpoint-10000",
4
- "epoch": 64.51323175621492,
5
- "global_step": 10000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.64,
12
- "learning_rate": 8.4824e-06,
13
- "loss": 6.9402,
14
  "step": 100
15
  },
16
  {
17
- "epoch": 1.29,
18
- "learning_rate": 1.6402400000000004e-05,
19
- "loss": 3.2608,
20
  "step": 200
21
  },
22
  {
23
- "epoch": 1.93,
24
- "learning_rate": 2.4322400000000003e-05,
25
- "loss": 2.4251,
26
  "step": 300
27
  },
28
  {
29
- "epoch": 2.58,
30
- "learning_rate": 3.22424e-05,
31
- "loss": 1.4757,
32
  "step": 400
33
  },
34
  {
35
- "epoch": 3.22,
36
- "learning_rate": 4.01624e-05,
37
- "loss": 1.2323,
38
  "step": 500
39
  },
40
  {
41
- "epoch": 3.22,
42
- "eval_cer": 0.07965777210531003,
43
- "eval_loss": 0.28158774971961975,
44
- "eval_runtime": 197.0167,
45
- "eval_samples_per_second": 29.449,
46
- "eval_steps_per_second": 0.462,
47
- "eval_wer": 0.41325687856906884,
48
  "step": 500
49
  },
50
  {
51
- "epoch": 3.87,
52
- "learning_rate": 4.80824e-05,
53
- "loss": 1.1608,
54
  "step": 600
55
  },
56
  {
57
- "epoch": 4.51,
58
- "learning_rate": 5.600240000000001e-05,
59
- "loss": 1.1297,
60
  "step": 700
61
  },
62
  {
63
- "epoch": 5.16,
64
- "learning_rate": 6.39224e-05,
65
- "loss": 1.0738,
66
  "step": 800
67
  },
68
  {
69
- "epoch": 5.8,
70
- "learning_rate": 7.184240000000001e-05,
71
- "loss": 1.0863,
72
  "step": 900
73
  },
74
  {
75
- "epoch": 6.45,
76
- "learning_rate": 7.97624e-05,
77
- "loss": 0.9826,
78
  "step": 1000
79
  },
80
  {
81
- "epoch": 6.45,
82
- "eval_cer": 0.05135273515182096,
83
- "eval_loss": 0.19702914357185364,
84
- "eval_runtime": 195.781,
85
- "eval_samples_per_second": 29.635,
86
- "eval_steps_per_second": 0.465,
87
- "eval_wer": 0.26877153239888624,
88
  "step": 1000
89
  },
90
  {
91
- "epoch": 7.1,
92
- "learning_rate": 8e-05,
93
- "loss": 0.9708,
94
  "step": 1100
95
  },
96
  {
97
- "epoch": 7.74,
98
- "learning_rate": 8e-05,
99
- "loss": 0.917,
100
  "step": 1200
101
  },
102
  {
103
- "epoch": 8.38,
104
- "learning_rate": 8e-05,
105
- "loss": 0.888,
106
  "step": 1300
107
  },
108
  {
109
- "epoch": 9.03,
110
- "learning_rate": 8e-05,
111
- "loss": 0.9048,
112
  "step": 1400
113
  },
114
  {
115
- "epoch": 9.67,
116
- "learning_rate": 8e-05,
117
- "loss": 0.8628,
118
  "step": 1500
119
  },
120
  {
121
- "epoch": 9.67,
122
- "eval_cer": 0.04743848505535603,
123
- "eval_loss": 0.16490551829338074,
124
- "eval_runtime": 196.0899,
125
- "eval_samples_per_second": 29.588,
126
- "eval_steps_per_second": 0.464,
127
- "eval_wer": 0.24850158100901412,
128
  "step": 1500
129
  },
130
  {
131
- "epoch": 10.32,
132
- "learning_rate": 8e-05,
133
- "loss": 0.8616,
134
  "step": 1600
135
  },
136
  {
137
- "epoch": 10.96,
138
- "learning_rate": 8e-05,
139
- "loss": 0.8517,
140
  "step": 1700
141
  },
142
  {
143
- "epoch": 11.61,
144
- "learning_rate": 8e-05,
145
- "loss": 0.8455,
146
  "step": 1800
147
  },
148
  {
149
- "epoch": 12.26,
150
- "learning_rate": 8e-05,
151
- "loss": 0.8436,
152
  "step": 1900
153
  },
154
  {
155
- "epoch": 12.9,
156
- "learning_rate": 8e-05,
157
- "loss": 0.8348,
158
  "step": 2000
159
  },
160
  {
161
- "epoch": 12.9,
162
- "eval_cer": 0.046703867501706686,
163
- "eval_loss": 0.16045768558979034,
164
- "eval_runtime": 196.1747,
165
- "eval_samples_per_second": 29.576,
166
- "eval_steps_per_second": 0.464,
167
- "eval_wer": 0.24604747746472228,
168
  "step": 2000
169
  },
170
  {
171
- "epoch": 13.55,
172
- "learning_rate": 8e-05,
173
- "loss": 0.844,
174
  "step": 2100
175
  },
176
  {
177
- "epoch": 14.19,
178
- "learning_rate": 8e-05,
179
- "loss": 0.8369,
180
  "step": 2200
181
  },
182
  {
183
- "epoch": 14.83,
184
- "learning_rate": 8e-05,
185
- "loss": 0.8241,
186
  "step": 2300
187
  },
188
  {
189
- "epoch": 15.48,
190
- "learning_rate": 8e-05,
191
- "loss": 0.8235,
192
  "step": 2400
193
  },
194
  {
195
- "epoch": 16.13,
196
- "learning_rate": 8e-05,
197
- "loss": 0.8186,
198
  "step": 2500
199
  },
200
  {
201
- "epoch": 16.13,
202
- "eval_cer": 0.04690421774361105,
203
- "eval_loss": 0.1608021855354309,
204
- "eval_runtime": 196.173,
205
- "eval_samples_per_second": 29.576,
206
- "eval_steps_per_second": 0.464,
207
- "eval_wer": 0.24692057199490303,
208
  "step": 2500
209
  },
210
  {
211
- "epoch": 16.77,
212
- "learning_rate": 8e-05,
213
- "loss": 0.8355,
214
  "step": 2600
215
  },
216
  {
217
- "epoch": 17.42,
218
- "learning_rate": 8e-05,
219
- "loss": 0.8157,
220
  "step": 2700
221
  },
222
  {
223
- "epoch": 18.06,
224
- "learning_rate": 8e-05,
225
- "loss": 0.8175,
226
  "step": 2800
227
  },
228
  {
229
- "epoch": 18.71,
230
- "learning_rate": 8e-05,
231
- "loss": 0.801,
232
  "step": 2900
233
  },
234
  {
235
- "epoch": 19.35,
236
- "learning_rate": 8e-05,
237
- "loss": 0.8011,
238
  "step": 3000
239
  },
240
  {
241
- "epoch": 19.35,
242
- "eval_cer": 0.046789201863999284,
243
- "eval_loss": 0.1620311141014099,
244
- "eval_runtime": 197.3892,
245
- "eval_samples_per_second": 29.394,
246
- "eval_steps_per_second": 0.461,
247
- "eval_wer": 0.24118646467506724,
248
  "step": 3000
249
  },
250
  {
251
- "epoch": 19.99,
252
- "learning_rate": 8e-05,
253
- "loss": 0.7888,
254
  "step": 3100
255
  },
256
  {
257
- "epoch": 20.64,
258
- "learning_rate": 8e-05,
259
- "loss": 0.8008,
260
  "step": 3200
261
  },
262
  {
263
- "epoch": 21.29,
264
- "learning_rate": 8e-05,
265
- "loss": 0.8197,
266
  "step": 3300
267
  },
268
  {
269
- "epoch": 21.93,
270
- "learning_rate": 8e-05,
271
- "loss": 0.8065,
272
  "step": 3400
273
  },
274
  {
275
- "epoch": 22.58,
276
- "learning_rate": 8e-05,
277
- "loss": 0.807,
278
  "step": 3500
279
  },
280
  {
281
- "epoch": 22.58,
282
- "eval_cer": 0.049805586061559465,
283
- "eval_loss": 0.17369326949119568,
284
- "eval_runtime": 196.0869,
285
- "eval_samples_per_second": 29.589,
286
- "eval_steps_per_second": 0.464,
287
- "eval_wer": 0.252395110670631,
288
  "step": 3500
289
  },
290
  {
291
- "epoch": 23.22,
292
- "learning_rate": 8e-05,
293
- "loss": 0.8045,
294
  "step": 3600
295
  },
296
  {
297
- "epoch": 23.87,
298
- "learning_rate": 8e-05,
299
- "loss": 0.7925,
300
  "step": 3700
301
  },
302
  {
303
- "epoch": 24.51,
304
- "learning_rate": 8e-05,
305
- "loss": 0.8046,
306
  "step": 3800
307
  },
308
  {
309
- "epoch": 25.16,
310
- "learning_rate": 8e-05,
311
- "loss": 0.8102,
312
  "step": 3900
313
  },
314
  {
315
- "epoch": 25.8,
316
- "learning_rate": 8e-05,
317
- "loss": 0.7758,
318
  "step": 4000
319
  },
320
  {
321
- "epoch": 25.8,
322
- "eval_cer": 0.04979074530289988,
323
- "eval_loss": 0.1708839237689972,
324
- "eval_runtime": 196.4196,
325
- "eval_samples_per_second": 29.539,
326
- "eval_steps_per_second": 0.463,
327
- "eval_wer": 0.2535985652933126,
328
  "step": 4000
329
  },
330
  {
331
- "epoch": 26.45,
332
- "learning_rate": 8e-05,
333
- "loss": 0.7968,
334
  "step": 4100
335
  },
336
  {
337
- "epoch": 27.1,
338
- "learning_rate": 8e-05,
339
- "loss": 0.7904,
340
  "step": 4200
341
  },
342
  {
343
- "epoch": 27.74,
344
- "learning_rate": 8e-05,
345
- "loss": 0.8001,
346
  "step": 4300
347
  },
348
  {
349
- "epoch": 28.38,
350
- "learning_rate": 8e-05,
351
- "loss": 0.7869,
352
  "step": 4400
353
  },
354
  {
355
- "epoch": 29.03,
356
- "learning_rate": 8e-05,
357
- "loss": 0.7923,
358
  "step": 4500
359
  },
360
  {
361
- "epoch": 29.03,
362
- "eval_cer": 0.04736799145172301,
363
- "eval_loss": 0.16446976363658905,
364
- "eval_runtime": 196.4759,
365
- "eval_samples_per_second": 29.53,
366
- "eval_steps_per_second": 0.463,
367
- "eval_wer": 0.24356977677096606,
368
  "step": 4500
369
  },
370
  {
371
- "epoch": 29.67,
372
- "learning_rate": 8e-05,
373
- "loss": 0.772,
374
  "step": 4600
375
  },
376
  {
377
- "epoch": 30.32,
378
- "learning_rate": 8e-05,
379
- "loss": 0.7702,
380
  "step": 4700
381
  },
382
  {
383
- "epoch": 30.96,
384
- "learning_rate": 8e-05,
385
- "loss": 0.7797,
386
  "step": 4800
387
  },
388
  {
389
- "epoch": 31.61,
390
- "learning_rate": 8e-05,
391
- "loss": 0.7759,
392
  "step": 4900
393
  },
394
  {
395
- "epoch": 32.26,
396
- "learning_rate": 8e-05,
397
- "loss": 0.7717,
398
  "step": 5000
399
  },
400
  {
401
- "epoch": 32.26,
402
- "eval_cer": 0.052350776171677896,
403
- "eval_loss": 0.1811000257730484,
404
- "eval_runtime": 196.7068,
405
- "eval_samples_per_second": 29.496,
406
- "eval_steps_per_second": 0.463,
407
- "eval_wer": 0.26355656236726605,
408
  "step": 5000
409
  },
410
  {
411
- "epoch": 32.9,
412
- "learning_rate": 7.852560000000001e-05,
413
- "loss": 0.7608,
414
  "step": 5100
415
  },
416
  {
417
- "epoch": 33.55,
418
- "learning_rate": 7.700560000000001e-05,
419
- "loss": 0.763,
420
  "step": 5200
421
  },
422
  {
423
- "epoch": 34.19,
424
- "learning_rate": 7.54856e-05,
425
- "loss": 0.7712,
426
  "step": 5300
427
  },
428
  {
429
- "epoch": 34.83,
430
- "learning_rate": 7.39656e-05,
431
- "loss": 0.7478,
432
  "step": 5400
433
  },
434
  {
435
- "epoch": 35.48,
436
- "learning_rate": 7.24456e-05,
437
- "loss": 0.7447,
438
  "step": 5500
439
  },
440
  {
441
- "epoch": 35.48,
442
- "eval_cer": 0.04679662224332908,
443
- "eval_loss": 0.16353937983512878,
444
- "eval_runtime": 196.1767,
445
- "eval_samples_per_second": 29.575,
446
- "eval_steps_per_second": 0.464,
447
- "eval_wer": 0.2404785501911369,
448
  "step": 5500
449
  },
450
  {
451
- "epoch": 36.13,
452
- "learning_rate": 7.09256e-05,
453
- "loss": 0.7544,
454
  "step": 5600
455
  },
456
  {
457
- "epoch": 36.77,
458
- "learning_rate": 6.94056e-05,
459
- "loss": 0.7438,
460
  "step": 5700
461
  },
462
  {
463
- "epoch": 37.42,
464
- "learning_rate": 6.79008e-05,
465
- "loss": 0.742,
466
  "step": 5800
467
  },
468
  {
469
- "epoch": 38.06,
470
- "learning_rate": 6.638080000000001e-05,
471
- "loss": 0.7441,
472
  "step": 5900
473
  },
474
  {
475
- "epoch": 38.71,
476
- "learning_rate": 6.486080000000001e-05,
477
- "loss": 0.7267,
478
  "step": 6000
479
  },
480
  {
481
- "epoch": 38.71,
482
- "eval_cer": 0.046236383603929836,
483
- "eval_loss": 0.15783575177192688,
484
- "eval_runtime": 197.1092,
485
- "eval_samples_per_second": 29.435,
486
- "eval_steps_per_second": 0.462,
487
- "eval_wer": 0.23542876020576714,
488
  "step": 6000
489
  },
490
  {
491
- "epoch": 39.35,
492
- "learning_rate": 6.33408e-05,
493
- "loss": 0.7112,
494
  "step": 6100
495
  },
496
  {
497
- "epoch": 39.99,
498
- "learning_rate": 6.18208e-05,
499
- "loss": 0.7052,
500
  "step": 6200
501
  },
502
  {
503
- "epoch": 40.64,
504
- "learning_rate": 6.0300800000000004e-05,
505
- "loss": 0.7105,
506
  "step": 6300
507
  },
508
  {
509
- "epoch": 41.29,
510
- "learning_rate": 5.878080000000001e-05,
511
- "loss": 0.7107,
512
  "step": 6400
513
  },
514
  {
515
- "epoch": 41.93,
516
- "learning_rate": 5.72608e-05,
517
- "loss": 0.7046,
518
  "step": 6500
519
  },
520
  {
521
- "epoch": 41.93,
522
- "eval_cer": 0.044429521237125645,
523
- "eval_loss": 0.15552951395511627,
524
- "eval_runtime": 196.7222,
525
- "eval_samples_per_second": 29.493,
526
- "eval_steps_per_second": 0.463,
527
- "eval_wer": 0.22957666713860966,
528
  "step": 6500
529
  },
530
  {
531
- "epoch": 42.58,
532
- "learning_rate": 5.574080000000001e-05,
533
- "loss": 0.7035,
534
  "step": 6600
535
  },
536
  {
537
- "epoch": 43.22,
538
- "learning_rate": 5.422080000000001e-05,
539
- "loss": 0.6967,
540
  "step": 6700
541
  },
542
  {
543
- "epoch": 43.87,
544
- "learning_rate": 5.271600000000001e-05,
545
- "loss": 0.687,
546
  "step": 6800
547
  },
548
  {
549
- "epoch": 44.51,
550
- "learning_rate": 5.1196e-05,
551
- "loss": 0.6875,
552
  "step": 6900
553
  },
554
  {
555
- "epoch": 45.16,
556
- "learning_rate": 4.967600000000001e-05,
557
- "loss": 0.6896,
558
  "step": 7000
559
  },
560
  {
561
- "epoch": 45.16,
562
- "eval_cer": 0.043932355822029624,
563
- "eval_loss": 0.15479956567287445,
564
- "eval_runtime": 196.5953,
565
- "eval_samples_per_second": 29.512,
566
- "eval_steps_per_second": 0.463,
567
- "eval_wer": 0.2271697578932465,
568
  "step": 7000
569
  },
570
  {
571
- "epoch": 45.8,
572
- "learning_rate": 4.8156000000000004e-05,
573
- "loss": 0.6722,
574
  "step": 7100
575
  },
576
  {
577
- "epoch": 46.45,
578
- "learning_rate": 4.663600000000001e-05,
579
- "loss": 0.6816,
580
  "step": 7200
581
  },
582
  {
583
- "epoch": 47.1,
584
- "learning_rate": 4.5116000000000006e-05,
585
- "loss": 0.6658,
586
  "step": 7300
587
  },
588
  {
589
- "epoch": 47.74,
590
- "learning_rate": 4.359600000000001e-05,
591
- "loss": 0.6507,
592
  "step": 7400
593
  },
594
  {
595
- "epoch": 48.38,
596
- "learning_rate": 4.207600000000001e-05,
597
- "loss": 0.6575,
598
  "step": 7500
599
  },
600
  {
601
- "epoch": 48.38,
602
- "eval_cer": 0.03991422041494761,
603
- "eval_loss": 0.14319901168346405,
604
- "eval_runtime": 196.3465,
605
- "eval_samples_per_second": 29.55,
606
- "eval_steps_per_second": 0.463,
607
- "eval_wer": 0.2096370758412384,
608
  "step": 7500
609
  },
610
  {
611
- "epoch": 49.03,
612
- "learning_rate": 4.0556e-05,
613
- "loss": 0.6524,
614
  "step": 7600
615
  },
616
  {
617
- "epoch": 49.67,
618
- "learning_rate": 3.9036000000000004e-05,
619
- "loss": 0.6336,
620
  "step": 7700
621
  },
622
  {
623
- "epoch": 50.32,
624
- "learning_rate": 3.751600000000001e-05,
625
- "loss": 0.6335,
626
  "step": 7800
627
  },
628
  {
629
- "epoch": 50.96,
630
- "learning_rate": 3.5996000000000006e-05,
631
- "loss": 0.6356,
632
  "step": 7900
633
  },
634
  {
635
- "epoch": 51.61,
636
- "learning_rate": 3.447600000000001e-05,
637
- "loss": 0.6264,
638
  "step": 8000
639
  },
640
  {
641
- "epoch": 51.61,
642
- "eval_cer": 0.039750972069692206,
643
- "eval_loss": 0.14660798013210297,
644
- "eval_runtime": 197.4423,
645
- "eval_samples_per_second": 29.386,
646
- "eval_steps_per_second": 0.461,
647
- "eval_wer": 0.20560196328283542,
648
  "step": 8000
649
  },
650
  {
651
- "epoch": 52.26,
652
- "learning_rate": 3.295600000000001e-05,
653
- "loss": 0.6151,
654
  "step": 8100
655
  },
656
  {
657
- "epoch": 52.9,
658
- "learning_rate": 3.1436e-05,
659
- "loss": 0.6138,
660
  "step": 8200
661
  },
662
  {
663
- "epoch": 53.55,
664
- "learning_rate": 2.9916000000000003e-05,
665
- "loss": 0.6066,
666
  "step": 8300
667
  },
668
  {
669
- "epoch": 54.19,
670
- "learning_rate": 2.839600000000001e-05,
671
- "loss": 0.6091,
672
  "step": 8400
673
  },
674
  {
675
- "epoch": 54.83,
676
- "learning_rate": 2.687600000000001e-05,
677
- "loss": 0.589,
678
  "step": 8500
679
  },
680
  {
681
- "epoch": 54.83,
682
- "eval_cer": 0.03713528835594076,
683
- "eval_loss": 0.1351083666086197,
684
- "eval_runtime": 196.1992,
685
- "eval_samples_per_second": 29.572,
686
- "eval_steps_per_second": 0.464,
687
- "eval_wer": 0.19427533153994997,
688
  "step": 8500
689
  },
690
  {
691
- "epoch": 55.48,
692
- "learning_rate": 2.5356000000000006e-05,
693
- "loss": 0.59,
694
  "step": 8600
695
  },
696
  {
697
- "epoch": 56.13,
698
- "learning_rate": 2.3836000000000007e-05,
699
- "loss": 0.5954,
700
  "step": 8700
701
  },
702
  {
703
- "epoch": 56.77,
704
- "learning_rate": 2.2316000000000005e-05,
705
- "loss": 0.5886,
706
  "step": 8800
707
  },
708
  {
709
- "epoch": 57.42,
710
- "learning_rate": 2.0796000000000002e-05,
711
- "loss": 0.5923,
712
  "step": 8900
713
  },
714
  {
715
- "epoch": 58.06,
716
- "learning_rate": 1.927600000000001e-05,
717
- "loss": 0.573,
718
  "step": 9000
719
  },
720
  {
721
- "epoch": 58.06,
722
- "eval_cer": 0.03653794781989255,
723
- "eval_loss": 0.13869842886924744,
724
- "eval_runtime": 197.6459,
725
- "eval_samples_per_second": 29.356,
726
- "eval_steps_per_second": 0.46,
727
- "eval_wer": 0.19342583415923356,
728
  "step": 9000
729
  },
730
  {
731
- "epoch": 58.71,
732
- "learning_rate": 1.7756000000000008e-05,
733
- "loss": 0.5681,
734
  "step": 9100
735
  },
736
  {
737
- "epoch": 59.35,
738
- "learning_rate": 1.623600000000001e-05,
739
- "loss": 0.5749,
740
  "step": 9200
741
  },
742
  {
743
- "epoch": 59.99,
744
- "learning_rate": 1.4716000000000006e-05,
745
- "loss": 0.5649,
746
  "step": 9300
747
  },
748
  {
749
- "epoch": 60.64,
750
- "learning_rate": 1.3196000000000004e-05,
751
- "loss": 0.5649,
752
  "step": 9400
753
  },
754
  {
755
- "epoch": 61.29,
756
- "learning_rate": 1.1676000000000003e-05,
757
- "loss": 0.5537,
758
  "step": 9500
759
  },
760
  {
761
- "epoch": 61.29,
762
- "eval_cer": 0.035276483333828025,
763
- "eval_loss": 0.132797509431839,
764
- "eval_runtime": 197.1347,
765
- "eval_samples_per_second": 29.432,
766
- "eval_steps_per_second": 0.462,
767
- "eval_wer": 0.18825805842654209,
768
  "step": 9500
769
  },
770
  {
771
- "epoch": 61.93,
772
- "learning_rate": 1.015600000000001e-05,
773
- "loss": 0.5551,
774
  "step": 9600
775
  },
776
  {
777
- "epoch": 62.58,
778
- "learning_rate": 8.636000000000008e-06,
779
- "loss": 0.5563,
780
  "step": 9700
781
  },
782
  {
783
- "epoch": 63.22,
784
- "learning_rate": 7.116000000000008e-06,
785
- "loss": 0.5469,
786
  "step": 9800
787
  },
788
  {
789
- "epoch": 63.87,
790
- "learning_rate": 5.596000000000006e-06,
791
- "loss": 0.5431,
792
  "step": 9900
793
  },
794
  {
795
- "epoch": 64.51,
796
- "learning_rate": 4.076000000000005e-06,
797
- "loss": 0.544,
798
  "step": 10000
799
  },
800
  {
801
- "epoch": 64.51,
802
- "eval_cer": 0.034204238520673176,
803
- "eval_loss": 0.12852737307548523,
804
- "eval_runtime": 197.3431,
805
- "eval_samples_per_second": 29.401,
806
- "eval_steps_per_second": 0.461,
807
- "eval_wer": 0.18207560526688377,
808
  "step": 10000
809
  },
810
  {
811
- "epoch": 64.51,
812
- "step": 10000,
813
- "total_flos": 6.715892353150186e+20,
814
- "train_loss": 0.5247637950897217,
815
- "train_runtime": 62085.4852,
816
- "train_samples_per_second": 20.617,
817
- "train_steps_per_second": 0.161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
  }
819
  ],
820
- "max_steps": 10000,
821
- "num_train_epochs": 65,
822
- "total_flos": 6.715892353150186e+20,
823
  "trial_name": null,
824
  "trial_params": null
825
  }
 
1
  {
2
+ "best_metric": 0.10924588888883591,
3
+ "best_model_checkpoint": "./checkpoint-12000",
4
+ "epoch": 38.58520900321543,
5
+ "global_step": 12000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.32,
12
+ "learning_rate": 4.50125e-06,
13
+ "loss": 8.0884,
14
  "step": 100
15
  },
16
  {
17
+ "epoch": 0.64,
18
+ "learning_rate": 8.62625e-06,
19
+ "loss": 3.2246,
20
  "step": 200
21
  },
22
  {
23
+ "epoch": 0.96,
24
+ "learning_rate": 1.2751250000000001e-05,
25
+ "loss": 3.1607,
26
  "step": 300
27
  },
28
  {
29
+ "epoch": 1.29,
30
+ "learning_rate": 1.6876250000000003e-05,
31
+ "loss": 2.3964,
32
  "step": 400
33
  },
34
  {
35
+ "epoch": 1.61,
36
+ "learning_rate": 2.100125e-05,
37
+ "loss": 1.7005,
38
  "step": 500
39
  },
40
  {
41
+ "epoch": 1.61,
42
+ "eval_cer": 0.11636638864978778,
43
+ "eval_loss": 0.4082379639148712,
44
+ "eval_runtime": 199.3377,
45
+ "eval_samples_per_second": 29.106,
46
+ "eval_steps_per_second": 0.457,
47
+ "eval_wer": 0.5583793477747888,
48
  "step": 500
49
  },
50
  {
51
+ "epoch": 1.93,
52
+ "learning_rate": 2.512625e-05,
53
+ "loss": 1.4874,
54
  "step": 600
55
  },
56
  {
57
+ "epoch": 2.25,
58
+ "learning_rate": 2.9251250000000002e-05,
59
+ "loss": 1.3431,
60
  "step": 700
61
  },
62
  {
63
+ "epoch": 2.57,
64
+ "learning_rate": 3.3376250000000004e-05,
65
+ "loss": 1.2316,
66
  "step": 800
67
  },
68
  {
69
+ "epoch": 2.89,
70
+ "learning_rate": 3.750125e-05,
71
+ "loss": 1.187,
72
  "step": 900
73
  },
74
  {
75
+ "epoch": 3.22,
76
+ "learning_rate": 4.162625e-05,
77
+ "loss": 1.1555,
78
  "step": 1000
79
  },
80
  {
81
+ "epoch": 3.22,
82
+ "eval_cer": 0.05566026535276483,
83
+ "eval_loss": 0.2020130306482315,
84
+ "eval_runtime": 199.2116,
85
+ "eval_samples_per_second": 29.125,
86
+ "eval_steps_per_second": 0.457,
87
+ "eval_wer": 0.29534192269573833,
88
  "step": 1000
89
  },
90
  {
91
+ "epoch": 3.54,
92
+ "learning_rate": 4.575125e-05,
93
+ "loss": 1.1286,
94
  "step": 1100
95
  },
96
  {
97
+ "epoch": 3.86,
98
+ "learning_rate": 4.9876250000000005e-05,
99
+ "loss": 1.1143,
100
  "step": 1200
101
  },
102
  {
103
+ "epoch": 4.18,
104
+ "learning_rate": 5e-05,
105
+ "loss": 1.1067,
106
  "step": 1300
107
  },
108
  {
109
+ "epoch": 4.5,
110
+ "learning_rate": 5e-05,
111
+ "loss": 1.0992,
112
  "step": 1400
113
  },
114
  {
115
+ "epoch": 4.82,
116
+ "learning_rate": 5e-05,
117
+ "loss": 1.0927,
118
  "step": 1500
119
  },
120
  {
121
+ "epoch": 4.82,
122
+ "eval_cer": 0.04799130331542548,
123
+ "eval_loss": 0.1707664430141449,
124
+ "eval_runtime": 197.8453,
125
+ "eval_samples_per_second": 29.326,
126
+ "eval_steps_per_second": 0.46,
127
+ "eval_wer": 0.25843598093350323,
128
  "step": 1500
129
  },
130
  {
131
+ "epoch": 5.14,
132
+ "learning_rate": 5e-05,
133
+ "loss": 1.0907,
134
  "step": 1600
135
  },
136
  {
137
+ "epoch": 5.47,
138
+ "learning_rate": 5e-05,
139
+ "loss": 1.0765,
140
  "step": 1700
141
  },
142
  {
143
+ "epoch": 5.79,
144
+ "learning_rate": 5e-05,
145
+ "loss": 1.0693,
146
  "step": 1800
147
  },
148
  {
149
+ "epoch": 6.11,
150
+ "learning_rate": 5e-05,
151
+ "loss": 1.0547,
152
  "step": 1900
153
  },
154
  {
155
+ "epoch": 6.43,
156
+ "learning_rate": 5e-05,
157
+ "loss": 1.0707,
158
  "step": 2000
159
  },
160
  {
161
+ "epoch": 6.43,
162
+ "eval_cer": 0.04497120892820041,
163
+ "eval_loss": 0.15630319714546204,
164
+ "eval_runtime": 197.159,
165
+ "eval_samples_per_second": 29.428,
166
+ "eval_steps_per_second": 0.462,
167
+ "eval_wer": 0.24054934163952996,
168
  "step": 2000
169
  },
170
  {
171
+ "epoch": 6.75,
172
+ "learning_rate": 5e-05,
173
+ "loss": 1.0647,
174
  "step": 2100
175
  },
176
  {
177
+ "epoch": 7.07,
178
+ "learning_rate": 5e-05,
179
+ "loss": 1.054,
180
  "step": 2200
181
  },
182
  {
183
+ "epoch": 7.4,
184
+ "learning_rate": 5e-05,
185
+ "loss": 1.0478,
186
  "step": 2300
187
  },
188
  {
189
+ "epoch": 7.72,
190
+ "learning_rate": 5e-05,
191
+ "loss": 1.0611,
192
  "step": 2400
193
  },
194
  {
195
+ "epoch": 8.04,
196
+ "learning_rate": 5e-05,
197
+ "loss": 1.0728,
198
  "step": 2500
199
  },
200
  {
201
+ "epoch": 8.04,
202
+ "eval_cer": 0.04629574663856816,
203
+ "eval_loss": 0.16203930974006653,
204
+ "eval_runtime": 196.804,
205
+ "eval_samples_per_second": 29.481,
206
+ "eval_steps_per_second": 0.462,
207
+ "eval_wer": 0.2442304969559677,
208
  "step": 2500
209
  },
210
  {
211
+ "epoch": 8.36,
212
+ "learning_rate": 5e-05,
213
+ "loss": 1.0563,
214
  "step": 2600
215
  },
216
  {
217
+ "epoch": 8.68,
218
+ "learning_rate": 5e-05,
219
+ "loss": 1.0404,
220
  "step": 2700
221
  },
222
  {
223
+ "epoch": 9.0,
224
+ "learning_rate": 5e-05,
225
+ "loss": 1.0791,
226
  "step": 2800
227
  },
228
  {
229
+ "epoch": 9.32,
230
+ "learning_rate": 5e-05,
231
+ "loss": 1.0535,
232
  "step": 2900
233
  },
234
  {
235
+ "epoch": 9.65,
236
+ "learning_rate": 5e-05,
237
+ "loss": 1.0268,
238
  "step": 3000
239
  },
240
  {
241
+ "epoch": 9.65,
242
+ "eval_cer": 0.04575776913715829,
243
+ "eval_loss": 0.15875375270843506,
244
+ "eval_runtime": 201.4769,
245
+ "eval_samples_per_second": 28.797,
246
+ "eval_steps_per_second": 0.452,
247
+ "eval_wer": 0.2377884751522016,
248
  "step": 3000
249
  },
250
  {
251
+ "epoch": 9.97,
252
+ "learning_rate": 5e-05,
253
+ "loss": 1.0322,
254
  "step": 3100
255
  },
256
  {
257
+ "epoch": 10.29,
258
+ "learning_rate": 5e-05,
259
+ "loss": 1.0208,
260
  "step": 3200
261
  },
262
  {
263
+ "epoch": 10.61,
264
+ "learning_rate": 5e-05,
265
+ "loss": 1.0172,
266
  "step": 3300
267
  },
268
  {
269
+ "epoch": 10.93,
270
+ "learning_rate": 5e-05,
271
+ "loss": 1.019,
272
  "step": 3400
273
  },
274
  {
275
+ "epoch": 11.25,
276
+ "learning_rate": 5e-05,
277
+ "loss": 1.0328,
278
  "step": 3500
279
  },
280
  {
281
+ "epoch": 11.25,
282
+ "eval_cer": 0.04419206909857232,
283
+ "eval_loss": 0.14661966264247894,
284
+ "eval_runtime": 196.9894,
285
+ "eval_samples_per_second": 29.453,
286
+ "eval_steps_per_second": 0.462,
287
+ "eval_wer": 0.23516919156165936,
288
  "step": 3500
289
  },
290
  {
291
+ "epoch": 11.58,
292
+ "learning_rate": 5e-05,
293
+ "loss": 1.0153,
294
  "step": 3600
295
  },
296
  {
297
+ "epoch": 11.9,
298
+ "learning_rate": 5e-05,
299
+ "loss": 1.0206,
300
  "step": 3700
301
  },
302
  {
303
+ "epoch": 12.22,
304
+ "learning_rate": 5e-05,
305
+ "loss": 1.0168,
306
  "step": 3800
307
  },
308
  {
309
+ "epoch": 12.54,
310
+ "learning_rate": 5e-05,
311
+ "loss": 1.0269,
312
  "step": 3900
313
  },
314
  {
315
+ "epoch": 12.86,
316
+ "learning_rate": 5e-05,
317
+ "loss": 1.0249,
318
  "step": 4000
319
  },
320
  {
321
+ "epoch": 12.86,
322
+ "eval_cer": 0.04486361342791843,
323
+ "eval_loss": 0.15519459545612335,
324
+ "eval_runtime": 197.5966,
325
+ "eval_samples_per_second": 29.363,
326
+ "eval_steps_per_second": 0.461,
327
+ "eval_wer": 0.23413091698522817,
328
  "step": 4000
329
  },
330
  {
331
+ "epoch": 13.18,
332
+ "learning_rate": 5e-05,
333
+ "loss": 1.022,
334
  "step": 4100
335
  },
336
  {
337
+ "epoch": 13.5,
338
+ "learning_rate": 5e-05,
339
+ "loss": 1.0219,
340
  "step": 4200
341
  },
342
  {
343
+ "epoch": 13.83,
344
+ "learning_rate": 5e-05,
345
+ "loss": 1.0203,
346
  "step": 4300
347
  },
348
  {
349
+ "epoch": 14.15,
350
+ "learning_rate": 5e-05,
351
+ "loss": 1.0171,
352
  "step": 4400
353
  },
354
  {
355
+ "epoch": 14.47,
356
+ "learning_rate": 5e-05,
357
+ "loss": 1.016,
358
  "step": 4500
359
  },
360
  {
361
+ "epoch": 14.47,
362
+ "eval_cer": 0.047286367279095305,
363
+ "eval_loss": 0.16016805171966553,
364
+ "eval_runtime": 197.4133,
365
+ "eval_samples_per_second": 29.39,
366
+ "eval_steps_per_second": 0.461,
367
+ "eval_wer": 0.2435461796215017,
368
  "step": 4500
369
  },
370
  {
371
+ "epoch": 14.79,
372
+ "learning_rate": 5e-05,
373
+ "loss": 1.0233,
374
  "step": 4600
375
  },
376
  {
377
+ "epoch": 15.11,
378
+ "learning_rate": 5e-05,
379
+ "loss": 1.0139,
380
  "step": 4700
381
  },
382
  {
383
+ "epoch": 15.43,
384
+ "learning_rate": 5e-05,
385
+ "loss": 1.0252,
386
  "step": 4800
387
  },
388
  {
389
+ "epoch": 15.76,
390
+ "learning_rate": 4.936666666666667e-05,
391
+ "loss": 1.0305,
392
  "step": 4900
393
  },
394
  {
395
+ "epoch": 16.08,
396
+ "learning_rate": 4.870694444444445e-05,
397
+ "loss": 1.0164,
398
  "step": 5000
399
  },
400
  {
401
+ "epoch": 16.08,
402
+ "eval_cer": 0.044392419340476684,
403
+ "eval_loss": 0.14910832047462463,
404
+ "eval_runtime": 205.8325,
405
+ "eval_samples_per_second": 28.188,
406
+ "eval_steps_per_second": 0.442,
407
+ "eval_wer": 0.23372976544433433,
408
  "step": 5000
409
  },
410
  {
411
+ "epoch": 16.4,
412
+ "learning_rate": 4.804722222222223e-05,
413
+ "loss": 1.0029,
414
  "step": 5100
415
  },
416
  {
417
+ "epoch": 16.72,
418
+ "learning_rate": 4.73875e-05,
419
+ "loss": 0.9924,
420
  "step": 5200
421
  },
422
  {
423
+ "epoch": 17.04,
424
+ "learning_rate": 4.672777777777778e-05,
425
+ "loss": 1.0058,
426
  "step": 5300
427
  },
428
  {
429
+ "epoch": 17.36,
430
+ "learning_rate": 4.606805555555556e-05,
431
+ "loss": 0.996,
432
  "step": 5400
433
  },
434
  {
435
+ "epoch": 17.68,
436
+ "learning_rate": 4.540833333333334e-05,
437
+ "loss": 0.9935,
438
  "step": 5500
439
  },
440
  {
441
+ "epoch": 17.68,
442
+ "eval_cer": 0.045754058947493396,
443
+ "eval_loss": 0.15390604734420776,
444
+ "eval_runtime": 206.7044,
445
+ "eval_samples_per_second": 28.069,
446
+ "eval_steps_per_second": 0.44,
447
+ "eval_wer": 0.23729293501345036,
448
  "step": 5500
449
  },
450
  {
451
+ "epoch": 18.01,
452
+ "learning_rate": 4.4748611111111116e-05,
453
+ "loss": 0.9993,
454
  "step": 5600
455
  },
456
  {
457
+ "epoch": 18.33,
458
+ "learning_rate": 4.408888888888889e-05,
459
+ "loss": 0.983,
460
  "step": 5700
461
  },
462
  {
463
+ "epoch": 18.65,
464
+ "learning_rate": 4.342916666666667e-05,
465
+ "loss": 0.9794,
466
  "step": 5800
467
  },
468
  {
469
+ "epoch": 18.97,
470
+ "learning_rate": 4.2769444444444447e-05,
471
+ "loss": 0.9719,
472
  "step": 5900
473
  },
474
  {
475
+ "epoch": 19.29,
476
+ "learning_rate": 4.2109722222222226e-05,
477
+ "loss": 0.9626,
478
  "step": 6000
479
  },
480
  {
481
+ "epoch": 19.29,
482
+ "eval_cer": 0.04342777002760381,
483
+ "eval_loss": 0.1458132266998291,
484
+ "eval_runtime": 201.2355,
485
+ "eval_samples_per_second": 28.832,
486
+ "eval_steps_per_second": 0.452,
487
+ "eval_wer": 0.2305441502666478,
488
  "step": 6000
489
  },
490
  {
491
+ "epoch": 19.61,
492
+ "learning_rate": 4.145e-05,
493
+ "loss": 0.9542,
494
  "step": 6100
495
  },
496
  {
497
+ "epoch": 19.94,
498
+ "learning_rate": 4.079027777777778e-05,
499
+ "loss": 0.978,
500
  "step": 6200
501
  },
502
  {
503
+ "epoch": 20.26,
504
+ "learning_rate": 4.013055555555556e-05,
505
+ "loss": 0.9536,
506
  "step": 6300
507
  },
508
  {
509
+ "epoch": 20.58,
510
+ "learning_rate": 3.9470833333333335e-05,
511
+ "loss": 0.9627,
512
  "step": 6400
513
  },
514
  {
515
+ "epoch": 20.9,
516
+ "learning_rate": 3.8811111111111114e-05,
517
+ "loss": 0.9505,
518
  "step": 6500
519
  },
520
  {
521
+ "epoch": 20.9,
522
+ "eval_cer": 0.04073046214122466,
523
+ "eval_loss": 0.13684287667274475,
524
+ "eval_runtime": 202.0319,
525
+ "eval_samples_per_second": 28.718,
526
+ "eval_steps_per_second": 0.45,
527
+ "eval_wer": 0.21565434895464627,
528
  "step": 6500
529
  },
530
  {
531
+ "epoch": 21.22,
532
+ "learning_rate": 3.815138888888889e-05,
533
+ "loss": 0.9395,
534
  "step": 6600
535
  },
536
  {
537
+ "epoch": 21.54,
538
+ "learning_rate": 3.749166666666667e-05,
539
+ "loss": 0.9393,
540
  "step": 6700
541
  },
542
  {
543
+ "epoch": 21.86,
544
+ "learning_rate": 3.6831944444444444e-05,
545
+ "loss": 0.9541,
546
  "step": 6800
547
  },
548
  {
549
+ "epoch": 22.19,
550
+ "learning_rate": 3.6172222222222224e-05,
551
+ "loss": 0.9538,
552
  "step": 6900
553
  },
554
  {
555
+ "epoch": 22.51,
556
+ "learning_rate": 3.55125e-05,
557
+ "loss": 0.9389,
558
  "step": 7000
559
  },
560
  {
561
+ "epoch": 22.51,
562
+ "eval_cer": 0.042626369059986347,
563
+ "eval_loss": 0.14371351897716522,
564
+ "eval_runtime": 197.7954,
565
+ "eval_samples_per_second": 29.333,
566
+ "eval_steps_per_second": 0.46,
567
+ "eval_wer": 0.22306385388645053,
568
  "step": 7000
569
  },
570
  {
571
+ "epoch": 22.83,
572
+ "learning_rate": 3.485277777777778e-05,
573
+ "loss": 0.9429,
574
  "step": 7100
575
  },
576
  {
577
+ "epoch": 23.15,
578
+ "learning_rate": 3.419965277777778e-05,
579
+ "loss": 0.9407,
580
  "step": 7200
581
  },
582
  {
583
+ "epoch": 23.47,
584
+ "learning_rate": 3.353993055555556e-05,
585
+ "loss": 0.9224,
586
  "step": 7300
587
  },
588
  {
589
+ "epoch": 23.79,
590
+ "learning_rate": 3.288020833333334e-05,
591
+ "loss": 0.9197,
592
  "step": 7400
593
  },
594
  {
595
+ "epoch": 24.12,
596
+ "learning_rate": 3.2220486111111115e-05,
597
+ "loss": 0.9129,
598
  "step": 7500
599
  },
600
  {
601
+ "epoch": 24.12,
602
+ "eval_cer": 0.039372532723872845,
603
+ "eval_loss": 0.13133755326271057,
604
+ "eval_runtime": 209.4773,
605
+ "eval_samples_per_second": 27.698,
606
+ "eval_steps_per_second": 0.434,
607
+ "eval_wer": 0.20760772098730473,
608
  "step": 7500
609
  },
610
  {
611
+ "epoch": 24.44,
612
+ "learning_rate": 3.156076388888889e-05,
613
+ "loss": 0.9169,
614
  "step": 7600
615
  },
616
  {
617
+ "epoch": 24.76,
618
+ "learning_rate": 3.090763888888889e-05,
619
+ "loss": 0.9133,
620
  "step": 7700
621
  },
622
  {
623
+ "epoch": 25.08,
624
+ "learning_rate": 3.024791666666667e-05,
625
+ "loss": 0.9068,
626
  "step": 7800
627
  },
628
  {
629
+ "epoch": 25.4,
630
+ "learning_rate": 2.958819444444445e-05,
631
+ "loss": 0.9137,
632
  "step": 7900
633
  },
634
  {
635
+ "epoch": 25.72,
636
+ "learning_rate": 2.8928472222222224e-05,
637
+ "loss": 0.9118,
638
  "step": 8000
639
  },
640
  {
641
+ "epoch": 25.72,
642
+ "eval_cer": 0.03844869549731382,
643
+ "eval_loss": 0.12918178737163544,
644
+ "eval_runtime": 197.6149,
645
+ "eval_samples_per_second": 29.36,
646
+ "eval_steps_per_second": 0.46,
647
+ "eval_wer": 0.2040445514181887,
648
  "step": 8000
649
  },
650
  {
651
+ "epoch": 26.05,
652
+ "learning_rate": 2.826875e-05,
653
+ "loss": 0.9057,
654
  "step": 8100
655
  },
656
  {
657
+ "epoch": 26.37,
658
+ "learning_rate": 2.7609027777777785e-05,
659
+ "loss": 0.8956,
660
  "step": 8200
661
  },
662
  {
663
+ "epoch": 26.69,
664
+ "learning_rate": 2.694930555555556e-05,
665
+ "loss": 0.9088,
666
  "step": 8300
667
  },
668
  {
669
+ "epoch": 27.01,
670
+ "learning_rate": 2.6289583333333333e-05,
671
+ "loss": 0.8997,
672
  "step": 8400
673
  },
674
  {
675
+ "epoch": 27.33,
676
+ "learning_rate": 2.5629861111111116e-05,
677
+ "loss": 0.8848,
678
  "step": 8500
679
  },
680
  {
681
+ "epoch": 27.33,
682
+ "eval_cer": 0.03840788341099997,
683
+ "eval_loss": 0.1298777312040329,
684
+ "eval_runtime": 197.318,
685
+ "eval_samples_per_second": 29.404,
686
+ "eval_steps_per_second": 0.461,
687
+ "eval_wer": 0.20281749964604276,
688
  "step": 8500
689
  },
690
  {
691
+ "epoch": 27.65,
692
+ "learning_rate": 2.4970138888888895e-05,
693
+ "loss": 0.8926,
694
  "step": 8600
695
  },
696
  {
697
+ "epoch": 27.97,
698
+ "learning_rate": 2.431041666666667e-05,
699
+ "loss": 0.8802,
700
  "step": 8700
701
  },
702
  {
703
+ "epoch": 28.3,
704
+ "learning_rate": 2.365069444444445e-05,
705
+ "loss": 0.8784,
706
  "step": 8800
707
  },
708
  {
709
+ "epoch": 28.62,
710
+ "learning_rate": 2.2990972222222225e-05,
711
+ "loss": 0.8749,
712
  "step": 8900
713
  },
714
  {
715
+ "epoch": 28.94,
716
+ "learning_rate": 2.2331250000000004e-05,
717
+ "loss": 0.8667,
718
  "step": 9000
719
  },
720
  {
721
+ "epoch": 28.94,
722
+ "eval_cer": 0.03673829806179692,
723
+ "eval_loss": 0.12283530086278915,
724
+ "eval_runtime": 199.3855,
725
+ "eval_samples_per_second": 29.099,
726
+ "eval_steps_per_second": 0.456,
727
+ "eval_wer": 0.1945113030345934,
728
  "step": 9000
729
  },
730
  {
731
+ "epoch": 29.26,
732
+ "learning_rate": 2.1671527777777783e-05,
733
+ "loss": 0.8628,
734
  "step": 9100
735
  },
736
  {
737
+ "epoch": 29.58,
738
+ "learning_rate": 2.101180555555556e-05,
739
+ "loss": 0.8775,
740
  "step": 9200
741
  },
742
  {
743
+ "epoch": 29.9,
744
+ "learning_rate": 2.0352083333333338e-05,
745
+ "loss": 0.8661,
746
  "step": 9300
747
  },
748
  {
749
+ "epoch": 30.23,
750
+ "learning_rate": 1.9692361111111114e-05,
751
+ "loss": 0.8624,
752
  "step": 9400
753
  },
754
  {
755
+ "epoch": 30.55,
756
+ "learning_rate": 1.9032638888888893e-05,
757
+ "loss": 0.8641,
758
  "step": 9500
759
  },
760
  {
761
+ "epoch": 30.55,
762
+ "eval_cer": 0.036352438336647766,
763
+ "eval_loss": 0.12234856933355331,
764
+ "eval_runtime": 202.2537,
765
+ "eval_samples_per_second": 28.687,
766
+ "eval_steps_per_second": 0.45,
767
+ "eval_wer": 0.19385058284959178,
768
  "step": 9500
769
  },
770
  {
771
+ "epoch": 30.87,
772
+ "learning_rate": 1.837291666666667e-05,
773
+ "loss": 0.8637,
774
  "step": 9600
775
  },
776
  {
777
+ "epoch": 31.19,
778
+ "learning_rate": 1.7713194444444447e-05,
779
+ "loss": 0.8608,
780
  "step": 9700
781
  },
782
  {
783
+ "epoch": 31.51,
784
+ "learning_rate": 1.7053472222222226e-05,
785
+ "loss": 0.8556,
786
  "step": 9800
787
  },
788
  {
789
+ "epoch": 31.83,
790
+ "learning_rate": 1.6393750000000002e-05,
791
+ "loss": 0.854,
792
  "step": 9900
793
  },
794
  {
795
+ "epoch": 32.15,
796
+ "learning_rate": 1.573402777777778e-05,
797
+ "loss": 0.8516,
798
  "step": 10000
799
  },
800
  {
801
+ "epoch": 32.15,
802
+ "eval_cer": 0.03494627645365231,
803
+ "eval_loss": 0.11841931194067001,
804
+ "eval_runtime": 199.2371,
805
+ "eval_samples_per_second": 29.121,
806
+ "eval_steps_per_second": 0.457,
807
+ "eval_wer": 0.18762093539100477,
808
  "step": 10000
809
  },
810
  {
811
+ "epoch": 32.48,
812
+ "learning_rate": 1.5074305555555557e-05,
813
+ "loss": 0.8433,
814
+ "step": 10100
815
+ },
816
+ {
817
+ "epoch": 32.8,
818
+ "learning_rate": 1.4414583333333338e-05,
819
+ "loss": 0.8507,
820
+ "step": 10200
821
+ },
822
+ {
823
+ "epoch": 33.12,
824
+ "learning_rate": 1.3754861111111117e-05,
825
+ "loss": 0.8419,
826
+ "step": 10300
827
+ },
828
+ {
829
+ "epoch": 33.44,
830
+ "learning_rate": 1.3095138888888892e-05,
831
+ "loss": 0.8344,
832
+ "step": 10400
833
+ },
834
+ {
835
+ "epoch": 33.76,
836
+ "learning_rate": 1.2435416666666671e-05,
837
+ "loss": 0.8379,
838
+ "step": 10500
839
+ },
840
+ {
841
+ "epoch": 33.76,
842
+ "eval_cer": 0.03375159538155591,
843
+ "eval_loss": 0.11372008919715881,
844
+ "eval_runtime": 199.4785,
845
+ "eval_samples_per_second": 29.086,
846
+ "eval_steps_per_second": 0.456,
847
+ "eval_wer": 0.18207560526688377,
848
+ "step": 10500
849
+ },
850
+ {
851
+ "epoch": 34.08,
852
+ "learning_rate": 1.1782291666666672e-05,
853
+ "loss": 0.8302,
854
+ "step": 10600
855
+ },
856
+ {
857
+ "epoch": 34.41,
858
+ "learning_rate": 1.1122569444444448e-05,
859
+ "loss": 0.8294,
860
+ "step": 10700
861
+ },
862
+ {
863
+ "epoch": 34.73,
864
+ "learning_rate": 1.0462847222222227e-05,
865
+ "loss": 0.8225,
866
+ "step": 10800
867
+ },
868
+ {
869
+ "epoch": 35.05,
870
+ "learning_rate": 9.803125000000001e-06,
871
+ "loss": 0.8237,
872
+ "step": 10900
873
+ },
874
+ {
875
+ "epoch": 35.37,
876
+ "learning_rate": 9.143402777777782e-06,
877
+ "loss": 0.8235,
878
+ "step": 11000
879
+ },
880
+ {
881
+ "epoch": 35.37,
882
+ "eval_cer": 0.03308005105220979,
883
+ "eval_loss": 0.11269930005073547,
884
+ "eval_runtime": 198.8276,
885
+ "eval_samples_per_second": 29.181,
886
+ "eval_steps_per_second": 0.458,
887
+ "eval_wer": 0.1778753126622304,
888
+ "step": 11000
889
+ },
890
+ {
891
+ "epoch": 35.69,
892
+ "learning_rate": 8.483680555555563e-06,
893
+ "loss": 0.8205,
894
+ "step": 11100
895
+ },
896
+ {
897
+ "epoch": 36.01,
898
+ "learning_rate": 7.823958333333337e-06,
899
+ "loss": 0.826,
900
+ "step": 11200
901
+ },
902
+ {
903
+ "epoch": 36.33,
904
+ "learning_rate": 7.1642361111111165e-06,
905
+ "loss": 0.8207,
906
+ "step": 11300
907
+ },
908
+ {
909
+ "epoch": 36.66,
910
+ "learning_rate": 6.504513888888891e-06,
911
+ "loss": 0.8129,
912
+ "step": 11400
913
+ },
914
+ {
915
+ "epoch": 36.98,
916
+ "learning_rate": 5.844791666666671e-06,
917
+ "loss": 0.8112,
918
+ "step": 11500
919
+ },
920
+ {
921
+ "epoch": 36.98,
922
+ "eval_cer": 0.03268677094773085,
923
+ "eval_loss": 0.11033473163843155,
924
+ "eval_runtime": 201.8103,
925
+ "eval_samples_per_second": 28.75,
926
+ "eval_steps_per_second": 0.451,
927
+ "eval_wer": 0.17662466374062014,
928
+ "step": 11500
929
+ },
930
+ {
931
+ "epoch": 37.3,
932
+ "learning_rate": 5.185069444444451e-06,
933
+ "loss": 0.805,
934
+ "step": 11600
935
+ },
936
+ {
937
+ "epoch": 37.62,
938
+ "learning_rate": 4.525347222222226e-06,
939
+ "loss": 0.8108,
940
+ "step": 11700
941
+ },
942
+ {
943
+ "epoch": 37.94,
944
+ "learning_rate": 3.865625000000006e-06,
945
+ "loss": 0.8025,
946
+ "step": 11800
947
+ },
948
+ {
949
+ "epoch": 38.26,
950
+ "learning_rate": 3.2059027777777807e-06,
951
+ "loss": 0.8018,
952
+ "step": 11900
953
+ },
954
+ {
955
+ "epoch": 38.59,
956
+ "learning_rate": 2.5461805555555606e-06,
957
+ "loss": 0.8069,
958
+ "step": 12000
959
+ },
960
+ {
961
+ "epoch": 38.59,
962
+ "eval_cer": 0.032260099136267845,
963
+ "eval_loss": 0.10924588888883591,
964
+ "eval_runtime": 199.7196,
965
+ "eval_samples_per_second": 29.051,
966
+ "eval_steps_per_second": 0.456,
967
+ "eval_wer": 0.17520883477275945,
968
+ "step": 12000
969
+ },
970
+ {
971
+ "epoch": 38.59,
972
+ "step": 12000,
973
+ "total_flos": 1.0363087195555613e+21,
974
+ "train_loss": 1.0786900800069172,
975
+ "train_runtime": 133237.4383,
976
+ "train_samples_per_second": 11.528,
977
+ "train_steps_per_second": 0.09
978
  }
979
  ],
980
+ "max_steps": 12000,
981
+ "num_train_epochs": 39,
982
+ "total_flos": 1.0363087195555613e+21,
983
  "trial_name": null,
984
  "trial_params": null
985
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004085d8188230d36e53e873905f4d0b969a1ce5224f8a4bcbea3db77af402c8
3
  size 3055
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bc1a8adc9b904ef39fbc4de06d5ac7b060a28d098cdb7eb65adcb5cd1d34935
3
  size 3055