m3hrdadfi committed on
Commit
aab8b6b
1 Parent(s): 9eb477e

Initial model

README.md ADDED
@@ -0,0 +1,306 @@
+ ---
+ language: tr
+ datasets:
+ - common_voice
+ tags:
+ - audio
+ - automatic-speech-recognition
+ - speech
+ - xlsr-fine-tuning-week
+ license: apache-2.0
+ widget:
+ - label: Common Voice sample 1378
+   src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-turkish/resolve/main/sample1378.flac
+ - label: Common Voice sample 1589
+   src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-turkish/resolve/main/sample1589.flac
+ model-index:
+ - name: XLSR Wav2Vec2 Turkish by Mehrdad Farahani
+   results:
+   - task:
+       name: Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Common Voice tr
+       type: common_voice
+       args: tr
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 27.51
+
+ ---
+
+ # Wav2Vec2-Large-XLSR-53-Turkish
+
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Turkish using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset. When using this model, make sure that your speech input is sampled at 16 kHz.
+
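The 16 kHz requirement matters because Common Voice clips typically ship at a higher sampling rate (48 kHz). The snippet below is a minimal, illustrative sketch of resampling a single clip before inference; it assumes `torchaudio` and `librosa` (installed in the requirements below) and a hypothetical local file `sample.wav`. The `speech_file_to_array_fn` helper in the scripts further down does the same thing per batch.

```python
import librosa
import numpy as np
import torchaudio

# Load a clip at its native sampling rate, then resample to the 16 kHz the model expects.
speech_array, sampling_rate = torchaudio.load("sample.wav")  # hypothetical local file
speech_array = speech_array.squeeze().numpy()
speech_16k = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16_000)
```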
+ ## Usage
+ The model can be used directly (without a language model) as follows:
+
+ **Requirements**
+ ```bash
+ # required packages
+ !pip install git+https://github.com/huggingface/datasets.git
+ !pip install git+https://github.com/huggingface/transformers.git
+ !pip install torchaudio
+ !pip install librosa
+ !pip install jiwer
+ ```
+
+
+ **Prediction**
+ ```python
+ import librosa
+ import torch
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ from datasets import load_dataset
+
+ import numpy as np
+ import re
+ import string
+
+ import IPython.display as ipd
+
+ chars_to_ignore = [
+     ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
+     "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
+     "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
+ ]
+ chars_to_mapping = {
+     "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
+ }
+
+ def multiple_replace(text, chars_to_mapping):
+     pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
+     return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
+
+ def remove_special_characters(text, chars_to_ignore_regex):
+     text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
+     return text
+
+ def normalizer(batch, chars_to_ignore, chars_to_mapping):
+     chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
+     text = batch["sentence"].lower().strip()
+
+     text = text.replace("\u0307", " ").strip()
+     text = multiple_replace(text, chars_to_mapping)
+     text = remove_special_characters(text, chars_to_ignore_regex)
+
+     batch["sentence"] = text
+     return batch
+
+
+ def speech_file_to_array_fn(batch):
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     speech_array = speech_array.squeeze().numpy()
+     speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
+
+     batch["speech"] = speech_array
+     return batch
+
+
+ def predict(batch):
+     features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+     input_values = features.input_values.to(device)
+     attention_mask = features.attention_mask.to(device)
+
+     with torch.no_grad():
+         logits = model(input_values, attention_mask=attention_mask).logits
+
+     pred_ids = torch.argmax(logits, dim=-1)
+
+     batch["predicted"] = processor.batch_decode(pred_ids)[0]
+     return batch
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish")
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish").to(device)
+
+ dataset = load_dataset("common_voice", "tr", split="test[:1%]")
+ dataset = dataset.map(
+     normalizer,
+     fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
+     remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
+ )
+
+ dataset = dataset.map(speech_file_to_array_fn)
+ result = dataset.map(predict)
+
+ max_items = np.random.randint(0, len(result), 10).tolist()
+ for i in max_items:
+     reference, predicted = result["sentence"][i], result["predicted"][i]
+     print("reference:", reference)
+     print("predicted:", predicted)
+     print('---')
+ ```
+
+ **Output:**
+ ```text
+ reference: ülke şu anda iki federasyona üye
+ predicted: ülke şu anda iki federasyona üye
+ ---
+ reference: foruma dört yüzde fazla kişi katıldı
+ predicted: soruma dört yüzden fazla kişi katıldı
+ ---
+ reference: mobi altmış üç çalışanları da mutsuz
+ predicted: mobia haltmış üç çalışanları da mutsur
+ ---
+ reference: kentin mali esnekliğinin düşük olduğu bildirildi
+ predicted: kentin mali esnekleğinin düşük olduğu bildirildi
+ ---
+ reference: fouere iki ülkeyi sorunu abartmamaya çağırdı
+ predicted: foor iki ülkeyi soruna abartmamaya çanayordı
+ ---
+ reference: o ülkeden herhangi bir tepki geldi mi
+ predicted: o ülkeden herhayın bir tepki geldi mi
+ ---
+ reference: bunlara asla sırtımızı dönmeyeceğiz
+ predicted: bunlara asla sırtımızı dönmeyeceğiz
+ ---
+ reference: sizi ayakta tutan nedir
+ predicted: sizi ayakta tutan nedir
+ ---
+ reference: artık insanlar daha bireysel yaşıyor
+ predicted: artık insanlar daha bir eyselli yaşıyor
+ ---
+ reference: her ikisi de diyaloga hazır olduğunu söylüyor
+ predicted: her ikisi de diyaloğa hazır olduğunu söylüyor
+ ---
+ reference: merkez bankasının başlıca amacı düşük enflasyon
+ predicted: merkez bankasının başlrıca anatı güşükyen flasyon
+ ---
+ reference: firefox
+ predicted: fair foks
+ ---
+ reference: ülke halkı çok misafirsever ve dışa dönük
+ predicted: ülke halktı çok isatirtever ve dışa dönük
+ ---
+ reference: ancak kamuoyu bu durumu pek de affetmiyor
+ predicted: ancak kamuonyulgukirmu pek deafıf etmiyor
+ ---
+ reference: i ki madende iki bin beş yüzden fazla kişi çalışıyor
+ predicted: i ki madende iki bin beş yüzden fazla kişi çalışıyor
+ ---
+ reference: sunnyside park dışarıdan oldukça iyi görünüyor
+ predicted: sani sahip park dışarıdan oldukça iyi görünüyor
+ ---
+ reference: büyük ödül on beş bin avro
+ predicted: büyük ödül on beş bin avro
+ ---
+ reference: köyümdeki camiler depoya dönüştürüldü
+ predicted: küyümdeki camiler depoya dönüştürüldü
+ ---
+ reference: maç oldukça diplomatik bir sonuçla birbir bitti
+ predicted: maç oldukça diplomatik bir sonuçla bir birbitti
+ ---
+ reference: kuşların ikisi de karantinada öldüler
+ predicted: kuşların ikiste karantinada özdüler
+ ---
+ ```
+
+
+ ## Evaluation
+
+ The model can be evaluated as follows on the Turkish test data of Common Voice.
+
+ ```python
+ import librosa
+ import torch
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ from datasets import load_dataset, load_metric
+
+ import numpy as np
+ import re
+ import string
+
+
+ chars_to_ignore = [
+     ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
+     "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
+     "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
+ ]
+ chars_to_mapping = {
+     "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
+     "\u0307": " "
+ }
+
+ def multiple_replace(text, chars_to_mapping):
+     pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
+     return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
+
+ def remove_special_characters(text, chars_to_ignore_regex):
+     text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
+     return text
+
+ def normalizer(batch, chars_to_ignore, chars_to_mapping):
+     chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
+     text = batch["sentence"].lower().strip()
+
+     text = text.replace("\u0307", " ").strip()
+     text = multiple_replace(text, chars_to_mapping)
+     text = remove_special_characters(text, chars_to_ignore_regex)
+     text = re.sub(" +", " ", text)
+     text = text.strip() + " "
+
+     batch["sentence"] = text
+     return batch
+
+
+ def speech_file_to_array_fn(batch):
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     speech_array = speech_array.squeeze().numpy()
+     speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
+
+     batch["speech"] = speech_array
+     return batch
+
+
+ def predict(batch):
+     features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+     input_values = features.input_values.to(device)
+     attention_mask = features.attention_mask.to(device)
+
+     with torch.no_grad():
+         logits = model(input_values, attention_mask=attention_mask).logits
+
+     pred_ids = torch.argmax(logits, dim=-1)
+
+     batch["predicted"] = processor.batch_decode(pred_ids)[0]
+     return batch
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish")
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish").to(device)
+
+ dataset = load_dataset("common_voice", "tr", split="test")
+ dataset = dataset.map(
+     normalizer,
+     fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
+     remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
+ )
+
+ dataset = dataset.map(speech_file_to_array_fn)
+ result = dataset.map(predict)
+
+ wer = load_metric("wer")
+
+ print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
+ ```
+
+ **Test Result**:
+ - WER: 27.51%
+
+
+ ## Training & Report
+ The Common Voice `train` and `validation` splits were used for training.
+
+ You can see the training report [here](https://wandb.ai/m3hrdadfi/finetuned_wav2vec_xlsr_turkish/reports/Fine-Tuning-for-Wav2Vec2-Large-XLSR-53-Turkish--Vmlldzo1Njc1MDc?accessToken=02vm5cwbi7d342vyt7h9w9859zex0enltdmjoreyjt3bd5qwv0vs0g3u93iv92q0).
+
+ The script used for training can be found [here](https://colab.research.google.com/github/m3hrdadfi/notebooks/blob/main/Fine_Tune_XLSR_Wav2Vec2_on_Turkish_ASR_with_%F0%9F%A4%97_Transformers_ipynb.ipynb).
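For a quick smoke test without the dataset plumbing above, recent versions of `transformers` also expose an `automatic-speech-recognition` pipeline that bundles feature extraction, the forward pass, and CTC decoding. A minimal sketch, assuming a `transformers` version that ships this pipeline and `ffmpeg` available for decoding audio files:

```python
from transformers import pipeline

# Wrap the fine-tuned checkpoint in the ASR pipeline.
asr = pipeline("automatic-speech-recognition", model="m3hrdadfi/wav2vec2-large-xlsr-turkish")

# Any mono audio file works; sample1378.flac is one of the clips shipped with this repository
# (download it locally first). The file is decoded at the feature extractor's 16 kHz rate.
print(asr("sample1378.flac")["text"])
```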
all_results.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "epoch": 79.99,
+   "eval_loss": 0.43752962350845337,
+   "eval_mem_cpu_alloc_delta": 135991777,
+   "eval_mem_cpu_peaked_delta": 12055738,
+   "eval_mem_gpu_alloc_delta": 0,
+   "eval_mem_gpu_peaked_delta": 5735247360,
+   "eval_runtime": 168.8086,
+   "eval_samples": 1647,
+   "eval_samples_per_second": 9.757,
+   "eval_wer": 0.29045685279187816,
+   "init_mem_cpu_alloc_delta": 51353,
+   "init_mem_cpu_peaked_delta": 18306,
+   "init_mem_gpu_alloc_delta": 0,
+   "init_mem_gpu_peaked_delta": 0,
+   "total_flos": 3.4759190857567523e+19,
+   "train_mem_cpu_alloc_delta": 34575838,
+   "train_mem_cpu_peaked_delta": 171805020,
+   "train_mem_gpu_alloc_delta": 3790510080,
+   "train_mem_gpu_peaked_delta": 6257791488,
+   "train_runtime": 32940.4351,
+   "train_samples": 3478,
+   "train_samples_per_second": 0.151
+ }
config.json ADDED
@@ -0,0 +1,76 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+   "activation_dropout": 0.0,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": true,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": true,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.05,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_attention_heads": 16,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "pad_token_id": 0,
+   "transformers_version": "4.5.0.dev0",
+   "vocab_size": 40
+ }
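This config records the XLSR-53 architecture (a 7-layer convolutional feature extractor followed by 24 transformer layers with 1024-dimensional hidden states) plus a CTC head sized to the 40-symbol vocabulary. A small sketch of reading these values back through the standard `Wav2Vec2Config` API:

```python
from transformers import Wav2Vec2Config

config = Wav2Vec2Config.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish")

# A few of the fields stored in config.json above.
print(config.model_type)         # wav2vec2
print(config.num_hidden_layers)  # 24
print(config.hidden_size)        # 1024
print(config.vocab_size)         # 40, matching the number of entries in vocab.json
```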
eval_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "epoch": 79.99,
+   "eval_loss": 0.43752962350845337,
+   "eval_mem_cpu_alloc_delta": 135991777,
+   "eval_mem_cpu_peaked_delta": 12055738,
+   "eval_mem_gpu_alloc_delta": 0,
+   "eval_mem_gpu_peaked_delta": 5735247360,
+   "eval_runtime": 168.8086,
+   "eval_samples": 1647,
+   "eval_samples_per_second": 9.757,
+   "eval_wer": 0.29045685279187816
+ }
predictions.csv ADDED
The diff for this file is too large to render. See raw diff
preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "do_normalize": true,
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
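The preprocessor config drives the feature-extraction half of `Wav2Vec2Processor`: mono 16 kHz input, zero-mean/unit-variance normalization, and right-side padding with an attention mask. A minimal sketch of loading and applying it on its own via the standard `Wav2Vec2FeatureExtractor` class:

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-turkish")
print(feature_extractor.sampling_rate)          # 16000
print(feature_extractor.return_attention_mask)  # True

# One second of dummy 16 kHz audio, normalized and padded into a batch of one.
inputs = feature_extractor(np.random.randn(16_000), sampling_rate=16_000, return_tensors="pt", padding=True)
print(inputs.input_values.shape)  # torch.Size([1, 16000])
```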
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7231dcadd7d585fe2d05a7dc13addf8d6c70d29a6b781f2a4a7c0d42e674225
+ size 1262097815
result.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1e91eac457062cde32dd686155804964e2a496d34a50d274ff5b92478d287e9
+ size 3183
sample1378.flac ADDED
Binary file (70 kB).
sample1589.flac ADDED
Binary file (57.3 kB).
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|"}
train_results.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "epoch": 79.99,
+   "init_mem_cpu_alloc_delta": 51353,
+   "init_mem_cpu_peaked_delta": 18306,
+   "init_mem_gpu_alloc_delta": 0,
+   "init_mem_gpu_peaked_delta": 0,
+   "total_flos": 3.4759190857567523e+19,
+   "train_mem_cpu_alloc_delta": 34575838,
+   "train_mem_cpu_peaked_delta": 171805020,
+   "train_mem_gpu_alloc_delta": 3790510080,
+   "train_mem_gpu_peaked_delta": 6257791488,
+   "train_runtime": 32940.4351,
+   "train_samples": 3478,
+   "train_samples_per_second": 0.151
+ }
trainer_state.json ADDED
@@ -0,0 +1,247 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 79.992,
+   "global_step": 4960,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 4.83,
+       "learning_rate": 0.00017999999999999998,
+       "loss": 5.8789,
+       "step": 300
+     },
+     {
+       "epoch": 4.83,
+       "eval_loss": 3.1512646675109863,
+       "eval_runtime": 167.7164,
+       "eval_samples_per_second": 9.82,
+       "eval_wer": 1.0,
+       "step": 300
+     },
+     {
+       "epoch": 9.67,
+       "learning_rate": 0.00029327354260089687,
+       "loss": 1.8544,
+       "step": 600
+     },
+     {
+       "epoch": 9.67,
+       "eval_loss": 0.4943191707134247,
+       "eval_runtime": 170.9272,
+       "eval_samples_per_second": 9.636,
+       "eval_wer": 0.5103553299492386,
+       "step": 600
+     },
+     {
+       "epoch": 14.51,
+       "learning_rate": 0.0002730941704035874,
+       "loss": 0.2628,
+       "step": 900
+     },
+     {
+       "epoch": 14.51,
+       "eval_loss": 0.46906277537345886,
+       "eval_runtime": 172.4339,
+       "eval_samples_per_second": 9.551,
+       "eval_wer": 0.43147208121827413,
+       "step": 900
+     },
+     {
+       "epoch": 19.35,
+       "learning_rate": 0.000252914798206278,
+       "loss": 0.124,
+       "step": 1200
+     },
+     {
+       "epoch": 19.35,
+       "eval_loss": 0.4130130708217621,
+       "eval_runtime": 172.8773,
+       "eval_samples_per_second": 9.527,
+       "eval_wer": 0.36,
+       "step": 1200
+     },
+     {
+       "epoch": 24.19,
+       "learning_rate": 0.0002327354260089686,
+       "loss": 0.0857,
+       "step": 1500
+     },
+     {
+       "epoch": 24.19,
+       "eval_loss": 0.43019285798072815,
+       "eval_runtime": 176.7931,
+       "eval_samples_per_second": 9.316,
+       "eval_wer": 0.3513705583756345,
+       "step": 1500
+     },
+     {
+       "epoch": 29.03,
+       "learning_rate": 0.00021255605381165918,
+       "loss": 0.0653,
+       "step": 1800
+     },
+     {
+       "epoch": 29.03,
+       "eval_loss": 0.4362075626850128,
+       "eval_runtime": 190.6223,
+       "eval_samples_per_second": 8.64,
+       "eval_wer": 0.3595939086294416,
+       "step": 1800
+     },
+     {
+       "epoch": 33.86,
+       "learning_rate": 0.00019237668161434975,
+       "loss": 0.0632,
+       "step": 2100
+     },
+     {
+       "epoch": 33.86,
+       "eval_loss": 0.4273272156715393,
+       "eval_runtime": 177.8151,
+       "eval_samples_per_second": 9.262,
+       "eval_wer": 0.33390862944162436,
+       "step": 2100
+     },
+     {
+       "epoch": 38.7,
+       "learning_rate": 0.00017219730941704035,
+       "loss": 0.0499,
+       "step": 2400
+     },
+     {
+       "epoch": 38.7,
+       "eval_loss": 0.4455905258655548,
+       "eval_runtime": 177.4805,
+       "eval_samples_per_second": 9.28,
+       "eval_wer": 0.32558375634517767,
+       "step": 2400
+     },
+     {
+       "epoch": 43.54,
+       "learning_rate": 0.00015201793721973095,
+       "loss": 0.0412,
+       "step": 2700
+     },
+     {
+       "epoch": 43.54,
+       "eval_loss": 0.4279979467391968,
+       "eval_runtime": 179.3129,
+       "eval_samples_per_second": 9.185,
+       "eval_wer": 0.3316751269035533,
+       "step": 2700
+     },
+     {
+       "epoch": 48.38,
+       "learning_rate": 0.00013183856502242152,
+       "loss": 0.0428,
+       "step": 3000
+     },
+     {
+       "epoch": 48.38,
+       "eval_loss": 0.42648839950561523,
+       "eval_runtime": 179.9053,
+       "eval_samples_per_second": 9.155,
+       "eval_wer": 0.3197969543147208,
+       "step": 3000
+     },
+     {
+       "epoch": 53.22,
+       "learning_rate": 0.0001116591928251121,
+       "loss": 0.0345,
+       "step": 3300
+     },
+     {
+       "epoch": 53.22,
+       "eval_loss": 0.46762773394584656,
+       "eval_runtime": 179.9553,
+       "eval_samples_per_second": 9.152,
+       "eval_wer": 0.31228426395939085,
+       "step": 3300
+     },
+     {
+       "epoch": 58.06,
+       "learning_rate": 9.147982062780269e-05,
+       "loss": 0.0335,
+       "step": 3600
+     },
+     {
+       "epoch": 58.06,
+       "eval_loss": 0.4403984248638153,
+       "eval_runtime": 181.1578,
+       "eval_samples_per_second": 9.092,
+       "eval_wer": 0.3096446700507614,
+       "step": 3600
+     },
+     {
+       "epoch": 62.9,
+       "learning_rate": 7.130044843049327e-05,
+       "loss": 0.0308,
+       "step": 3900
+     },
+     {
+       "epoch": 62.9,
+       "eval_loss": 0.4584444463253021,
+       "eval_runtime": 182.2608,
+       "eval_samples_per_second": 9.036,
+       "eval_wer": 0.3082233502538071,
+       "step": 3900
+     },
+     {
+       "epoch": 67.74,
+       "learning_rate": 5.112107623318385e-05,
+       "loss": 0.0253,
+       "step": 4200
+     },
+     {
+       "epoch": 67.74,
+       "eval_loss": 0.4203069806098938,
+       "eval_runtime": 182.5783,
+       "eval_samples_per_second": 9.021,
+       "eval_wer": 0.3001015228426396,
+       "step": 4200
+     },
+     {
+       "epoch": 72.58,
+       "learning_rate": 3.094170403587444e-05,
+       "loss": 0.0243,
+       "step": 4500
+     },
+     {
+       "epoch": 72.58,
+       "eval_loss": 0.43954339623451233,
+       "eval_runtime": 183.1424,
+       "eval_samples_per_second": 8.993,
+       "eval_wer": 0.29736040609137054,
+       "step": 4500
+     },
+     {
+       "epoch": 77.42,
+       "learning_rate": 1.0762331838565022e-05,
+       "loss": 0.0229,
+       "step": 4800
+     },
+     {
+       "epoch": 77.42,
+       "eval_loss": 0.44123971462249756,
+       "eval_runtime": 183.8976,
+       "eval_samples_per_second": 8.956,
+       "eval_wer": 0.29218274111675124,
+       "step": 4800
+     },
+     {
+       "epoch": 79.99,
+       "step": 4960,
+       "total_flos": 3.4759190857567523e+19,
+       "train_runtime": 32940.4351,
+       "train_samples_per_second": 0.151
+     }
+   ],
+   "max_steps": 4960,
+   "num_train_epochs": 80,
+   "total_flos": 3.4759190857567523e+19,
+   "trial_name": null,
+   "trial_params": null
+ }
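The trainer state above keeps the full evaluation trajectory: WER starts at 1.0 at step 300 and falls to roughly 0.292 by step 4800 over 80 epochs. A short sketch for pulling that curve out of a local copy of `trainer_state.json`:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Evaluation entries in log_history carry eval_wer; training entries carry loss instead.
for entry in state["log_history"]:
    if "eval_wer" in entry:
        print(f'step {entry["step"]:>4}: WER = {entry["eval_wer"]:.4f}')
```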
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7743363a3e402661db5a2194639a678362707f82d18b359e7fe33276471e0169
+ size 2351
vocab.json ADDED
@@ -0,0 +1 @@
+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "a": 5, "b": 6, "c": 7, "d": 8, "e": 9, "f": 10, "g": 11, "h": 12, "i": 13, "j": 14, "k": 15, "l": 16, "m": 17, "n": 18, "o": 19, "p": 20, "q": 21, "r": 22, "s": 23, "t": 24, "u": 25, "v": 26, "w": 27, "x": 28, "y": 29, "z": 30, "â": 31, "ç": 32, "ë": 33, "î": 34, "ö": 35, "ü": 36, "ğ": 37, "ı": 38, "ş": 39}