lighteternal committed
Commit 2c4a267 (parent: a9d1c20)

Added new model trained for 60 epochs
ASR_Inference.ipynb CHANGED
@@ -2,11 +2,11 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": null,
6
  "metadata": {
7
  "ExecuteTime": {
8
- "end_time": "2021-03-14T09:33:41.892030Z",
9
- "start_time": "2021-03-14T09:33:40.729163Z"
10
  }
11
  },
12
  "outputs": [],
@@ -24,11 +24,11 @@
24
  },
25
  {
26
  "cell_type": "code",
27
- "execution_count": null,
28
  "metadata": {
29
  "ExecuteTime": {
30
- "end_time": "2021-03-14T09:33:41.909851Z",
31
- "start_time": "2021-03-14T09:33:41.906327Z"
32
  }
33
  },
34
  "outputs": [],
@@ -66,40 +66,57 @@
66
  },
67
  {
68
  "cell_type": "code",
69
- "execution_count": null,
70
  "metadata": {
71
  "ExecuteTime": {
72
- "end_time": "2021-03-14T09:33:49.053762Z",
73
- "start_time": "2021-03-14T09:33:41.922683Z"
74
  }
75
  },
76
- "outputs": [],
77
  "source": [
78
- "model = Wav2Vec2ForCTC.from_pretrained(\"wav2vec2-large-xlsr-greek/checkpoint-9200/\").to(\"cuda\")\n",
79
  "processor = Wav2Vec2Processor.from_pretrained(\"wav2vec2-large-xlsr-greek/\")"
80
  ]
81
  },
82
  {
83
  "cell_type": "code",
84
- "execution_count": null,
85
  "metadata": {
86
  "ExecuteTime": {
87
- "end_time": "2021-03-14T09:33:52.413558Z",
88
- "start_time": "2021-03-14T09:33:49.078466Z"
89
  }
90
  },
91
- "outputs": [],
92
  "source": [
93
  "common_voice_test = load_dataset(\"common_voice\", \"el\", data_dir=\"cv-corpus-6.1-2020-12-11\", split=\"test\")"
94
  ]
95
  },
96
  {
97
  "cell_type": "code",
98
- "execution_count": null,
99
  "metadata": {
100
  "ExecuteTime": {
101
- "end_time": "2021-03-14T09:33:52.444418Z",
102
- "start_time": "2021-03-14T09:33:52.441338Z"
103
  }
104
  },
105
  "outputs": [],
@@ -109,87 +126,286 @@
109
  },
110
  {
111
  "cell_type": "code",
112
- "execution_count": null,
113
  "metadata": {
114
  "ExecuteTime": {
115
- "end_time": "2021-03-14T09:33:52.473087Z",
116
- "start_time": "2021-03-14T09:33:52.468014Z"
117
  }
118
  },
119
- "outputs": [],
120
  "source": [
121
  "common_voice_test = common_voice_test.map(remove_special_characters, remove_columns=[\"sentence\"])"
122
  ]
123
  },
124
  {
125
  "cell_type": "code",
126
- "execution_count": null,
127
  "metadata": {
128
  "ExecuteTime": {
129
- "end_time": "2021-03-14T09:33:52.510377Z",
130
- "start_time": "2021-03-14T09:33:52.501677Z"
131
  }
132
  },
133
- "outputs": [],
134
  "source": [
135
  "common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)"
136
  ]
137
  },
138
  {
139
  "cell_type": "code",
140
- "execution_count": null,
141
  "metadata": {
142
  "ExecuteTime": {
143
- "end_time": "2021-03-14T09:33:53.321810Z",
144
- "start_time": "2021-03-14T09:33:52.533233Z"
145
  }
146
  },
147
- "outputs": [],
148
  "source": [
149
  "common_voice_test = common_voice_test.map(resample, num_proc=8)"
150
  ]
151
  },
152
  {
153
  "cell_type": "code",
154
- "execution_count": null,
155
  "metadata": {
156
  "ExecuteTime": {
157
- "end_time": "2021-03-14T09:33:53.611415Z",
158
- "start_time": "2021-03-14T09:33:53.342487Z"
159
  }
160
  },
161
- "outputs": [],
162
  "source": [
163
  "common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=8, batched=True)"
164
  ]
165
  },
166
  {
167
  "cell_type": "code",
168
- "execution_count": null,
169
  "metadata": {
170
  "ExecuteTime": {
171
- "end_time": "2021-03-14T09:33:56.243678Z",
172
- "start_time": "2021-03-14T09:33:53.632436Z"
173
  }
174
  },
175
- "outputs": [],
176
  "source": [
177
  "common_voice_test_transcription = load_dataset(\"common_voice\", \"el\", data_dir=\"./cv-corpus-6.1-2020-12-11\", split=\"test\")"
178
  ]
179
  },
180
  {
181
  "cell_type": "code",
182
- "execution_count": null,
183
  "metadata": {
184
  "ExecuteTime": {
185
- "end_time": "2021-03-14T09:36:50.076837Z",
186
- "start_time": "2021-03-14T09:36:24.943947Z"
187
  }
188
  },
189
  "outputs": [],
190
  "source": [
191
  "# Change this value to try inference on different CommonVoice extracts\n",
192
- "example = 123\n",
193
  "\n",
194
  "input_dict = processor(common_voice_test[\"input_values\"][example], return_tensors=\"pt\", sampling_rate=16_000, padding=True)\n",
195
  "\n",
@@ -200,30 +416,35 @@
200
  },
201
  {
202
  "cell_type": "code",
203
- "execution_count": null,
204
  "metadata": {
205
  "ExecuteTime": {
206
- "end_time": "2021-03-14T09:36:50.137886Z",
207
- "start_time": "2021-03-14T09:36:50.134218Z"
208
  }
209
  },
210
- "outputs": [],
211
  "source": [
212
  "print(\"Prediction:\")\n",
213
  "print(processor.decode(pred_ids[0]))\n",
214
- "# καμιά φορά τα έπαιρνε και έπαιζε όταν η δουλειά ήταν πιο χαλαρί\n",
215
  "\n",
216
  "print(\"\\nReference:\")\n",
217
  "print(common_voice_test_transcription[\"sentence\"][example].lower())\n",
218
- "# καμιά φορά τα έπαιρνε και έπαιζε όταν η δουλειά ήταν πιο χαλαρή"
219
  ]
220
- },
221
- {
222
- "cell_type": "code",
223
- "execution_count": null,
224
- "metadata": {},
225
- "outputs": [],
226
- "source": []
227
  }
228
  ],
229
  "metadata": {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 15,
6
  "metadata": {
7
  "ExecuteTime": {
8
+ "end_time": "2021-03-14T18:07:15.328900Z",
9
+ "start_time": "2021-03-14T18:07:15.326838Z"
10
  }
11
  },
12
  "outputs": [],
 
24
  },
25
  {
26
  "cell_type": "code",
27
+ "execution_count": 16,
28
  "metadata": {
29
  "ExecuteTime": {
30
+ "end_time": "2021-03-14T18:07:15.933957Z",
31
+ "start_time": "2021-03-14T18:07:15.927789Z"
32
  }
33
  },
34
  "outputs": [],
 
66
  },
67
  {
68
  "cell_type": "code",
69
+ "execution_count": 17,
70
  "metadata": {
71
  "ExecuteTime": {
72
+ "end_time": "2021-03-14T18:07:22.624226Z",
73
+ "start_time": "2021-03-14T18:07:16.402381Z"
74
  }
75
  },
76
+ "outputs": [
77
+ {
78
+ "name": "stderr",
79
+ "output_type": "stream",
80
+ "text": [
81
+ "Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.\n"
82
+ ]
83
+ }
84
+ ],
85
  "source": [
86
+ "model = Wav2Vec2ForCTC.from_pretrained(\"wav2vec2-large-xlsr-greek/checkpoint-18400/\").to(\"cuda\")\n",
87
  "processor = Wav2Vec2Processor.from_pretrained(\"wav2vec2-large-xlsr-greek/\")"
88
  ]
89
  },
90
  {
91
  "cell_type": "code",
92
+ "execution_count": 18,
93
  "metadata": {
94
  "ExecuteTime": {
95
+ "end_time": "2021-03-14T18:07:25.473609Z",
96
+ "start_time": "2021-03-14T18:07:22.644765Z"
97
  }
98
  },
99
+ "outputs": [
100
+ {
101
+ "name": "stderr",
102
+ "output_type": "stream",
103
+ "text": [
104
+ "Using custom data configuration el-afd0a157f05ee080\n",
105
+ "Reusing dataset common_voice (/home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564)\n"
106
+ ]
107
+ }
108
+ ],
109
  "source": [
110
  "common_voice_test = load_dataset(\"common_voice\", \"el\", data_dir=\"cv-corpus-6.1-2020-12-11\", split=\"test\")"
111
  ]
112
  },
113
  {
114
  "cell_type": "code",
115
+ "execution_count": 19,
116
  "metadata": {
117
  "ExecuteTime": {
118
+ "end_time": "2021-03-14T18:07:25.504511Z",
119
+ "start_time": "2021-03-14T18:07:25.500688Z"
120
  }
121
  },
122
  "outputs": [],
 
126
  },
127
  {
128
  "cell_type": "code",
129
+ "execution_count": 20,
130
  "metadata": {
131
  "ExecuteTime": {
132
+ "end_time": "2021-03-14T18:07:25.540666Z",
133
+ "start_time": "2021-03-14T18:07:25.536214Z"
134
  }
135
  },
136
+ "outputs": [
137
+ {
138
+ "name": "stderr",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-0ce2ebca66096fff.arrow\n"
142
+ ]
143
+ }
144
+ ],
145
  "source": [
146
  "common_voice_test = common_voice_test.map(remove_special_characters, remove_columns=[\"sentence\"])"
147
  ]
148
  },
149
  {
150
  "cell_type": "code",
151
+ "execution_count": 21,
152
  "metadata": {
153
  "ExecuteTime": {
154
+ "end_time": "2021-03-14T18:07:25.578015Z",
155
+ "start_time": "2021-03-14T18:07:25.568808Z"
156
  }
157
  },
158
+ "outputs": [
159
+ {
160
+ "name": "stderr",
161
+ "output_type": "stream",
162
+ "text": [
163
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-38a09981767eff59.arrow\n"
164
+ ]
165
+ }
166
+ ],
167
  "source": [
168
  "common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)"
169
  ]
170
  },
171
  {
172
  "cell_type": "code",
173
+ "execution_count": 22,
174
  "metadata": {
175
  "ExecuteTime": {
176
+ "end_time": "2021-03-14T18:07:26.404914Z",
177
+ "start_time": "2021-03-14T18:07:25.605177Z"
178
  }
179
  },
180
+ "outputs": [
181
+ {
182
+ "name": "stdout",
183
+ "output_type": "stream",
184
+ "text": [
185
+ " "
186
+ ]
187
+ },
188
+ {
189
+ "name": "stderr",
190
+ "output_type": "stream",
191
+ "text": [
192
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-ba8c6dd59eb8ccf2.arrow\n"
193
+ ]
194
+ },
195
+ {
196
+ "name": "stdout",
197
+ "output_type": "stream",
198
+ "text": [
199
+ " "
200
+ ]
201
+ },
202
+ {
203
+ "name": "stderr",
204
+ "output_type": "stream",
205
+ "text": [
206
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-2e240883a5f827fd.arrow\n"
207
+ ]
208
+ },
209
+ {
210
+ "name": "stdout",
211
+ "output_type": "stream",
212
+ "text": [
213
+ " "
214
+ ]
215
+ },
216
+ {
217
+ "name": "stderr",
218
+ "output_type": "stream",
219
+ "text": [
220
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-485c00dc9048ed50.arrow\n"
221
+ ]
222
+ },
223
+ {
224
+ "name": "stdout",
225
+ "output_type": "stream",
226
+ "text": [
227
+ " "
228
+ ]
229
+ },
230
+ {
231
+ "name": "stderr",
232
+ "output_type": "stream",
233
+ "text": [
234
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-44bf1791baae8e2e.arrow\n",
235
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-ecc0dfac5615a58e.arrow\n"
236
+ ]
237
+ },
238
+ {
239
+ "name": "stdout",
240
+ "output_type": "stream",
241
+ "text": [
242
+ " "
243
+ ]
244
+ },
245
+ {
246
+ "name": "stderr",
247
+ "output_type": "stream",
248
+ "text": [
249
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-bb54bb00dae79669.arrow\n",
250
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-923d905502a8661d.arrow\n"
251
+ ]
252
+ },
253
+ {
254
+ "name": "stdout",
255
+ "output_type": "stream",
256
+ "text": [
257
+ " "
258
+ ]
259
+ },
260
+ {
261
+ "name": "stderr",
262
+ "output_type": "stream",
263
+ "text": [
264
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-062aeafc3b8816c1.arrow\n"
265
+ ]
266
+ }
267
+ ],
268
  "source": [
269
  "common_voice_test = common_voice_test.map(resample, num_proc=8)"
270
  ]
271
  },
272
  {
273
  "cell_type": "code",
274
+ "execution_count": 23,
275
  "metadata": {
276
  "ExecuteTime": {
277
+ "end_time": "2021-03-14T18:07:27.032511Z",
278
+ "start_time": "2021-03-14T18:07:26.432613Z"
279
  }
280
  },
281
+ "outputs": [
282
+ {
283
+ "name": "stdout",
284
+ "output_type": "stream",
285
+ "text": [
286
+ " "
287
+ ]
288
+ },
289
+ {
290
+ "name": "stderr",
291
+ "output_type": "stream",
292
+ "text": [
293
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-82be72eab73488a6.arrow\n",
294
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-a30edec53656694c.arrow\n"
295
+ ]
296
+ },
297
+ {
298
+ "name": "stdout",
299
+ "output_type": "stream",
300
+ "text": [
301
+ " "
302
+ ]
303
+ },
304
+ {
305
+ "name": "stderr",
306
+ "output_type": "stream",
307
+ "text": [
308
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-91aacc366ff3e776.arrow\n"
309
+ ]
310
+ },
311
+ {
312
+ "name": "stdout",
313
+ "output_type": "stream",
314
+ "text": [
315
+ " "
316
+ ]
317
+ },
318
+ {
319
+ "name": "stderr",
320
+ "output_type": "stream",
321
+ "text": [
322
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-cce8223f5c38f863.arrow\n"
323
+ ]
324
+ },
325
+ {
326
+ "name": "stdout",
327
+ "output_type": "stream",
328
+ "text": [
329
+ " "
330
+ ]
331
+ },
332
+ {
333
+ "name": "stderr",
334
+ "output_type": "stream",
335
+ "text": [
336
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-4f0d5b132b7516de.arrow\n"
337
+ ]
338
+ },
339
+ {
340
+ "name": "stdout",
341
+ "output_type": "stream",
342
+ "text": [
343
+ " "
344
+ ]
345
+ },
346
+ {
347
+ "name": "stderr",
348
+ "output_type": "stream",
349
+ "text": [
350
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-55caed3924d51e22.arrow\n",
351
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-2cc086daed2595be.arrow\n"
352
+ ]
353
+ },
354
+ {
355
+ "name": "stdout",
356
+ "output_type": "stream",
357
+ "text": [
358
+ " "
359
+ ]
360
+ },
361
+ {
362
+ "name": "stderr",
363
+ "output_type": "stream",
364
+ "text": [
365
+ "Loading cached processed dataset at /home/earendil/.cache/huggingface/datasets/common_voice/el-afd0a157f05ee080/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564/cache-118401c99df7b83c.arrow\n"
366
+ ]
367
+ }
368
+ ],
369
  "source": [
370
  "common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=8, batched=True)"
371
  ]
372
  },
373
  {
374
  "cell_type": "code",
375
+ "execution_count": 24,
376
  "metadata": {
377
  "ExecuteTime": {
378
+ "end_time": "2021-03-14T18:07:29.428864Z",
379
+ "start_time": "2021-03-14T18:07:27.056686Z"
380
  }
381
  },
382
+ "outputs": [
383
+ {
384
+ "name": "stderr",
385
+ "output_type": "stream",
386
+ "text": [
387
+ "Using custom data configuration el-ac779bf2c9f7c09b\n",
388
+ "Reusing dataset common_voice (/home/earendil/.cache/huggingface/datasets/common_voice/el-ac779bf2c9f7c09b/6.1.0/32954a9015faa0d840f6c6894938545c5d12bc5d8936a80079af74bf50d71564)\n"
389
+ ]
390
+ }
391
+ ],
392
  "source": [
393
  "common_voice_test_transcription = load_dataset(\"common_voice\", \"el\", data_dir=\"./cv-corpus-6.1-2020-12-11\", split=\"test\")"
394
  ]
395
  },
396
  {
397
  "cell_type": "code",
398
+ "execution_count": 25,
399
  "metadata": {
400
  "ExecuteTime": {
401
+ "end_time": "2021-03-14T18:07:54.722520Z",
402
+ "start_time": "2021-03-14T18:07:29.451275Z"
403
  }
404
  },
405
  "outputs": [],
406
  "source": [
407
  "# Change this value to try inference on different CommonVoice extracts\n",
408
+ "example = 678\n",
409
  "\n",
410
  "input_dict = processor(common_voice_test[\"input_values\"][example], return_tensors=\"pt\", sampling_rate=16_000, padding=True)\n",
411
  "\n",
 
416
  },
417
  {
418
  "cell_type": "code",
419
+ "execution_count": 26,
420
  "metadata": {
421
  "ExecuteTime": {
422
+ "end_time": "2021-03-14T18:07:54.742988Z",
423
+ "start_time": "2021-03-14T18:07:54.739626Z"
424
  }
425
  },
426
+ "outputs": [
427
+ {
428
+ "name": "stdout",
429
+ "output_type": "stream",
430
+ "text": [
431
+ "Prediction:\n",
432
+ "πού θέλεις να πάμε ρώτησε φοβισμένα ο βασιλιάς\n",
433
+ "\n",
434
+ "Reference:\n",
435
+ "πού θέλεις να πάμε; ρώτησε φοβισμένα ο βασιλιάς.\n"
436
+ ]
437
+ }
438
+ ],
439
  "source": [
440
  "print(\"Prediction:\")\n",
441
  "print(processor.decode(pred_ids[0]))\n",
442
+ "# πού θέλεις να πάμε ρώτησε φοβισμένα ο βασιλιάς\n",
443
  "\n",
444
  "print(\"\\nReference:\")\n",
445
  "print(common_voice_test_transcription[\"sentence\"][example].lower())\n",
446
+ "# πού θέλεις να πάμε; ρώτησε φοβισμένα ο βασιλιάς."
447
  ]
448
  }
449
  ],
450
  "metadata": {
Fine_Tune_XLSR_Wav2Vec2_on_Greek_ASR_with_🤗_Transformers.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -15,15 +15,17 @@ tags:
15
  * language: el
16
  * licence: apache-2.0
17
  * dataset: CommonVoice (EL), 364MB: https://commonvoice.mozilla.org/el/datasets
18
- * model: XLSR-Wav2Vec2
19
  * metrics: WER
20
 
21
  ### Model description
22
 
23
- Wav2Vec2 is a pretrained model for Automatic Speech Recognition (ASR) and was released in September 2020 by Alexei Baevski, Michael Auli, and Alex Conneau. Soon after the superior performance of Wav2Vec2 was demonstrated on the English ASR dataset LibriSpeech, Facebook AI presented XLSR-Wav2Vec2 (click here). XLSR stands for cross-lingual speech representations and refers to XLSR-Wav2Vec2`s ability to learn speech representations that are useful across multiple languages.
24
 
25
 Similar to Wav2Vec2, XLSR-Wav2Vec2 learns powerful speech representations from hundreds of thousands of hours of unlabeled speech in more than 50 languages. Similarly to BERT's masked language modeling, the model learns contextualized speech representations by randomly masking feature vectors before passing them to a transformer network.
26
 
27
  ### How to use for inference:
28
 
29
 Instructions for testing on CommonVoice extracts are provided in ASR_Inference.ipynb. A snippet is also available below:
@@ -113,12 +115,11 @@ pred_ids = torch.argmax(logits, dim=-1)
113
 
114
  print("Prediction:")
115
  print(processor.decode(pred_ids[0]))
116
- # καμιά φορά τα έπαιρνε και έπαιζε όταν η δουλειά ήταν πιο χαλαρί
117
 
118
  print("\nReference:")
119
  print(common_voice_test_transcription["sentence"][example].lower())
120
- # καμιά φορά τα έπαιρνε και έπαιζε όταν η δουλειά ήταν πιο χαλαρή
121
-
122
 
123
  ```
124
 
@@ -131,9 +132,13 @@ Instructions and code to replicate the process are provided in the Fine_Tune_XLS
131
 
132
  | Metric | Value |
133
  | ----------- | ----------- |
134
- | Training Loss | 0.0536 |
135
- | Validation Loss | 0.61605 |
136
- | WER | 0.45049 |
137
 
138
 
139
  ### Acknowledgment
 
15
  * language: el
16
  * licence: apache-2.0
17
  * dataset: CommonVoice (EL), 364MB: https://commonvoice.mozilla.org/el/datasets
18
+ * model: XLSR-Wav2Vec2, trained for 60 epochs
19
  * metrics: WER
20
 
21
  ### Model description
22
 
23
+ Wav2Vec2 is a pretrained model for Automatic Speech Recognition (ASR), released in September 2020 by Alexei Baevski, Michael Auli, and Alex Conneau. Soon after Wav2Vec2's superior performance was demonstrated on the English ASR dataset LibriSpeech, Facebook AI presented XLSR-Wav2Vec2. XLSR stands for cross-lingual speech representations and refers to XLSR-Wav2Vec2's ability to learn speech representations that are useful across multiple languages.
24
 
25
 Similar to Wav2Vec2, XLSR-Wav2Vec2 learns powerful speech representations from hundreds of thousands of hours of unlabeled speech in more than 50 languages. Similarly to BERT's masked language modeling, the model learns contextualized speech representations by randomly masking feature vectors before passing them to a transformer network.
26
 
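As a rough, hypothetical sketch of the masking idea described above (the real span-masking logic lives inside the Wav2Vec2 implementation and substitutes a learned mask embedding rather than zeros):

```python
import torch

def mask_time_steps(features: torch.Tensor, mask_prob: float = 0.065) -> torch.Tensor:
    # features: (batch, time, dim) latent feature vectors extracted from raw audio
    batch, time, _ = features.shape
    # independently choose which time steps to hide from the transformer
    mask = torch.rand(batch, time) < mask_prob
    masked = features.clone()
    masked[mask] = 0.0  # simplified: the actual model inserts a trained mask vector here
    return masked
```

During pre-training the network has to recover the hidden steps from their context, which is what yields contextualized representations.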
27
+ This model was trained on Greek CommonVoice speech data (364MB) for 60 epochs on a single NVIDIA RTX 3080, taking approx. 8 hours.
28
+
29
  ### How to use for inference:
30
 
31
 Instructions for testing on CommonVoice extracts are provided in ASR_Inference.ipynb. A snippet is also available below:
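(The diff below shows only the tail of that snippet; for orientation, a condensed sketch of the full pipeline, assembled from the notebook cells above, is given here. The preprocessing helpers `remove_special_characters`, `speech_file_to_array_fn`, `resample` and `prepare_dataset` are defined in ASR_Inference.ipynb.)

```python
import torch
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# load the fine-tuned checkpoint and its processor
model = Wav2Vec2ForCTC.from_pretrained("wav2vec2-large-xlsr-greek/checkpoint-18400/").to("cuda")
processor = Wav2Vec2Processor.from_pretrained("wav2vec2-large-xlsr-greek/")

# load and preprocess the Greek CommonVoice test split
common_voice_test = load_dataset("common_voice", "el", data_dir="cv-corpus-6.1-2020-12-11", split="test")
common_voice_test = common_voice_test.map(remove_special_characters, remove_columns=["sentence"])
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)
common_voice_test = common_voice_test.map(resample, num_proc=8)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names,
                                          batch_size=8, num_proc=8, batched=True)

# greedy (argmax) CTC decoding for a single example
example = 678
input_dict = processor(common_voice_test["input_values"][example], return_tensors="pt",
                       sampling_rate=16_000, padding=True)
logits = model(input_dict.input_values.to("cuda")).logits
pred_ids = torch.argmax(logits, dim=-1)
print(processor.decode(pred_ids[0]))
```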
 
115
 
116
  print("Prediction:")
117
  print(processor.decode(pred_ids[0]))
118
+ # πού θέλεις να πάμε ρώτησε φοβισμένα ο βασιλιάς
119
 
120
  print("\nReference:")
121
  print(common_voice_test_transcription["sentence"][example].lower())
122
+ # πού θέλεις να πάμε; ρώτησε φοβισμένα ο βασιλιάς.
 
123
 
124
  ```
125
 
 
132
 
133
  | Metric | Value |
134
  | ----------- | ----------- |
135
+ | Training Loss | 0.0287 |
136
+ | Validation Loss | 0.6162 |
137
+ | WER | 0.4287 |
138
+
139
+ Full metrics log here:
140
+ <img src="https://huggingface.co/lighteternal/wav2vec2-large-xlsr-53-greek/raw/main/logs.png" width="600"/>
141
+
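+ The WER row above is the word error rate on the CommonVoice test split. A minimal sketch of how such a score can be computed from parallel transcript lists, using the `jiwer` package (not used in the notebook itself; the toy transcripts below are illustrative only):
+
+ ```python
+ import jiwer
+
+ # parallel lists of reference and predicted transcripts (illustrative only)
+ references = ["πού θέλεις να πάμε ρώτησε φοβισμένα ο βασιλιάς"]
+ hypotheses = ["πού θέλεις να πάμε ρώτησε φοβισμένα ο βασιλιάς"]
+
+ wer = jiwer.wer(references, hypotheses)  # 0.0 for an exact match
+ print(f"WER: {wer:.4f}")
+ ```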
142
 
143
 
144
  ### Acknowledgment
logs.png ADDED
wav2vec2-large-xlsr-greek/{checkpoint-9200 → checkpoint-18400}/config.json RENAMED
File without changes
wav2vec2-large-xlsr-greek/{checkpoint-9200 → checkpoint-18400}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00f31a1f9ea99f4f201431486237cb849ddaf1df3dbde251cd982a7c835ce2af
3
- size 2524210623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:272a2cd55e47d89c16aadfc21a5186b3c1ee4f0dd61f67d1dfbe6e325392f208
3
+ size 2490511751
wav2vec2-large-xlsr-greek/{checkpoint-9200 → checkpoint-18400}/preprocessor_config.json RENAMED
File without changes
wav2vec2-large-xlsr-greek/{checkpoint-9200 → checkpoint-18400}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86b18f91292ad1bfdbf47078ecd8514857e1bc6e5897e3194a08f8e59bcb6f64
3
  size 1262151127
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36e6ae5d87f7ddd5c2e1ec54a82ae65bcf46df9cf2a9e2e6a7e49b6ceda40464
3
  size 1262151127
wav2vec2-large-xlsr-greek/{checkpoint-9200 → checkpoint-18400}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57d74da9952c362d5a83bed5bbb6007b9c763cc15805a3d57f35fe8d7e0872a3
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:191d69c70a4b492d24ee1d59dad04edf9839c09a08b9cc6bd41530cfdaa0092c
3
  size 623
wav2vec2-large-xlsr-greek/checkpoint-18400/trainer_state.json ADDED
@@ -0,0 +1,660 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 59.354838709677416,
5
+ "global_step": 18400,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.29,
12
+ "learning_rate": 0.00023999999999999998,
13
+ "loss": 7.1936,
14
+ "step": 400
15
+ },
16
+ {
17
+ "epoch": 1.29,
18
+ "eval_loss": 3.278684139251709,
19
+ "eval_runtime": 123.6406,
20
+ "eval_samples_per_second": 12.31,
21
+ "eval_wer": 1.0,
22
+ "step": 400
23
+ },
24
+ {
25
+ "epoch": 2.58,
26
+ "learning_rate": 0.0002950276243093923,
27
+ "loss": 3.1985,
28
+ "step": 800
29
+ },
30
+ {
31
+ "epoch": 2.58,
32
+ "eval_loss": 2.8315000534057617,
33
+ "eval_runtime": 121.9578,
34
+ "eval_samples_per_second": 12.48,
35
+ "eval_wer": 0.9966690219037044,
36
+ "step": 800
37
+ },
38
+ {
39
+ "epoch": 3.87,
40
+ "learning_rate": 0.00028839779005524857,
41
+ "loss": 1.1566,
42
+ "step": 1200
43
+ },
44
+ {
45
+ "epoch": 3.87,
46
+ "eval_loss": 0.7793326377868652,
47
+ "eval_runtime": 113.9019,
48
+ "eval_samples_per_second": 13.362,
49
+ "eval_wer": 0.7725850408801858,
50
+ "step": 1200
51
+ },
52
+ {
53
+ "epoch": 5.16,
54
+ "learning_rate": 0.00028176795580110497,
55
+ "loss": 0.5297,
56
+ "step": 1600
57
+ },
58
+ {
59
+ "epoch": 5.16,
60
+ "eval_loss": 0.6474416255950928,
61
+ "eval_runtime": 122.7887,
62
+ "eval_samples_per_second": 12.395,
63
+ "eval_wer": 0.6751791662460886,
64
+ "step": 1600
65
+ },
66
+ {
67
+ "epoch": 6.45,
68
+ "learning_rate": 0.0002751381215469613,
69
+ "loss": 0.3725,
70
+ "step": 2000
71
+ },
72
+ {
73
+ "epoch": 6.45,
74
+ "eval_loss": 0.5674485564231873,
75
+ "eval_runtime": 114.3739,
76
+ "eval_samples_per_second": 13.307,
77
+ "eval_wer": 0.6073483395578884,
78
+ "step": 2000
79
+ },
80
+ {
81
+ "epoch": 7.74,
82
+ "learning_rate": 0.00026850828729281767,
83
+ "loss": 0.2877,
84
+ "step": 2400
85
+ },
86
+ {
87
+ "epoch": 7.74,
88
+ "eval_loss": 0.5630490183830261,
89
+ "eval_runtime": 114.404,
90
+ "eval_samples_per_second": 13.304,
91
+ "eval_wer": 0.5621277884324215,
92
+ "step": 2400
93
+ },
94
+ {
95
+ "epoch": 9.03,
96
+ "learning_rate": 0.000261878453038674,
97
+ "loss": 0.2342,
98
+ "step": 2800
99
+ },
100
+ {
101
+ "epoch": 9.03,
102
+ "eval_loss": 0.5479915142059326,
103
+ "eval_runtime": 123.0266,
104
+ "eval_samples_per_second": 12.371,
105
+ "eval_wer": 0.5403250227112143,
106
+ "step": 2800
107
+ },
108
+ {
109
+ "epoch": 10.32,
110
+ "learning_rate": 0.00025524861878453036,
111
+ "loss": 0.1967,
112
+ "step": 3200
113
+ },
114
+ {
115
+ "epoch": 10.32,
116
+ "eval_loss": 0.6276335120201111,
117
+ "eval_runtime": 114.8683,
118
+ "eval_samples_per_second": 13.25,
119
+ "eval_wer": 0.5577874230342182,
120
+ "step": 3200
121
+ },
122
+ {
123
+ "epoch": 11.61,
124
+ "learning_rate": 0.0002486187845303867,
125
+ "loss": 0.1729,
126
+ "step": 3600
127
+ },
128
+ {
129
+ "epoch": 11.61,
130
+ "eval_loss": 0.536430835723877,
131
+ "eval_runtime": 114.6009,
132
+ "eval_samples_per_second": 13.281,
133
+ "eval_wer": 0.5332593115978601,
134
+ "step": 3600
135
+ },
136
+ {
137
+ "epoch": 12.9,
138
+ "learning_rate": 0.00024198895027624309,
139
+ "loss": 0.1607,
140
+ "step": 4000
141
+ },
142
+ {
143
+ "epoch": 12.9,
144
+ "eval_loss": 0.5107040405273438,
145
+ "eval_runtime": 115.3852,
146
+ "eval_samples_per_second": 13.191,
147
+ "eval_wer": 0.495306349046129,
148
+ "step": 4000
149
+ },
150
+ {
151
+ "epoch": 14.19,
152
+ "learning_rate": 0.00023535911602209943,
153
+ "loss": 0.1381,
154
+ "step": 4400
155
+ },
156
+ {
157
+ "epoch": 14.19,
158
+ "eval_loss": 0.5929794907569885,
159
+ "eval_runtime": 123.4194,
160
+ "eval_samples_per_second": 12.332,
161
+ "eval_wer": 0.5220551125466841,
162
+ "step": 4400
163
+ },
164
+ {
165
+ "epoch": 15.48,
166
+ "learning_rate": 0.0002287292817679558,
167
+ "loss": 0.1261,
168
+ "step": 4800
169
+ },
170
+ {
171
+ "epoch": 15.48,
172
+ "eval_loss": 0.6632552742958069,
173
+ "eval_runtime": 124.1096,
174
+ "eval_samples_per_second": 12.263,
175
+ "eval_wer": 0.5305339658827092,
176
+ "step": 4800
177
+ },
178
+ {
179
+ "epoch": 16.77,
180
+ "learning_rate": 0.00022209944751381213,
181
+ "loss": 0.1216,
182
+ "step": 5200
183
+ },
184
+ {
185
+ "epoch": 16.77,
186
+ "eval_loss": 0.6114311218261719,
187
+ "eval_runtime": 116.8858,
188
+ "eval_samples_per_second": 13.021,
189
+ "eval_wer": 0.5084283839709296,
190
+ "step": 5200
191
+ },
192
+ {
193
+ "epoch": 18.06,
194
+ "learning_rate": 0.0002154696132596685,
195
+ "loss": 0.1119,
196
+ "step": 5600
197
+ },
198
+ {
199
+ "epoch": 18.06,
200
+ "eval_loss": 0.6091165542602539,
201
+ "eval_runtime": 123.4244,
202
+ "eval_samples_per_second": 12.331,
203
+ "eval_wer": 0.5041889573029171,
204
+ "step": 5600
205
+ },
206
+ {
207
+ "epoch": 19.35,
208
+ "learning_rate": 0.00020883977900552485,
209
+ "loss": 0.107,
210
+ "step": 6000
211
+ },
212
+ {
213
+ "epoch": 19.35,
214
+ "eval_loss": 0.5811592936515808,
215
+ "eval_runtime": 131.8655,
216
+ "eval_samples_per_second": 11.542,
217
+ "eval_wer": 0.49581104269708287,
218
+ "step": 6000
219
+ },
220
+ {
221
+ "epoch": 20.65,
222
+ "learning_rate": 0.00020220994475138123,
223
+ "loss": 0.0985,
224
+ "step": 6400
225
+ },
226
+ {
227
+ "epoch": 20.65,
228
+ "eval_loss": 0.6538776755332947,
229
+ "eval_runtime": 132.6579,
230
+ "eval_samples_per_second": 11.473,
231
+ "eval_wer": 0.5082265065105481,
232
+ "step": 6400
233
+ },
234
+ {
235
+ "epoch": 21.94,
236
+ "learning_rate": 0.00019558011049723755,
237
+ "loss": 0.0959,
238
+ "step": 6800
239
+ },
240
+ {
241
+ "epoch": 21.94,
242
+ "eval_loss": 0.602001965045929,
243
+ "eval_runtime": 132.2249,
244
+ "eval_samples_per_second": 11.511,
245
+ "eval_wer": 0.4906631674573534,
246
+ "step": 6800
247
+ },
248
+ {
249
+ "epoch": 23.23,
250
+ "learning_rate": 0.0001889502762430939,
251
+ "loss": 0.0899,
252
+ "step": 7200
253
+ },
254
+ {
255
+ "epoch": 23.23,
256
+ "eval_loss": 0.6006932854652405,
257
+ "eval_runtime": 115.0032,
258
+ "eval_samples_per_second": 13.234,
259
+ "eval_wer": 0.48813969920258404,
260
+ "step": 7200
261
+ },
262
+ {
263
+ "epoch": 24.52,
264
+ "learning_rate": 0.00018232044198895027,
265
+ "loss": 0.0867,
266
+ "step": 7600
267
+ },
268
+ {
269
+ "epoch": 24.52,
270
+ "eval_loss": 0.5921047329902649,
271
+ "eval_runtime": 115.0629,
272
+ "eval_samples_per_second": 13.228,
273
+ "eval_wer": 0.48763500555163014,
274
+ "step": 7600
275
+ },
276
+ {
277
+ "epoch": 25.81,
278
+ "learning_rate": 0.0001756906077348066,
279
+ "loss": 0.0825,
280
+ "step": 8000
281
+ },
282
+ {
283
+ "epoch": 25.81,
284
+ "eval_loss": 0.6151732802391052,
285
+ "eval_runtime": 115.0579,
286
+ "eval_samples_per_second": 13.228,
287
+ "eval_wer": 0.4937922680932674,
288
+ "step": 8000
289
+ },
290
+ {
291
+ "epoch": 27.1,
292
+ "learning_rate": 0.00016906077348066297,
293
+ "loss": 0.0768,
294
+ "step": 8400
295
+ },
296
+ {
297
+ "epoch": 27.1,
298
+ "eval_loss": 0.6089133024215698,
299
+ "eval_runtime": 123.8125,
300
+ "eval_samples_per_second": 12.293,
301
+ "eval_wer": 0.47723831634198044,
302
+ "step": 8400
303
+ },
304
+ {
305
+ "epoch": 28.39,
306
+ "learning_rate": 0.00016243093922651931,
307
+ "loss": 0.0698,
308
+ "step": 8800
309
+ },
310
+ {
311
+ "epoch": 28.39,
312
+ "eval_loss": 0.6180127263069153,
313
+ "eval_runtime": 123.6525,
314
+ "eval_samples_per_second": 12.309,
315
+ "eval_wer": 0.47330170586454023,
316
+ "step": 8800
317
+ },
318
+ {
319
+ "epoch": 29.68,
320
+ "learning_rate": 0.0001558011049723757,
321
+ "loss": 0.0753,
322
+ "step": 9200
323
+ },
324
+ {
325
+ "epoch": 29.68,
326
+ "eval_loss": 0.621383547782898,
327
+ "eval_runtime": 115.3106,
328
+ "eval_samples_per_second": 13.199,
329
+ "eval_wer": 0.4866256182497224,
330
+ "step": 9200
331
+ },
332
+ {
333
+ "epoch": 30.97,
334
+ "learning_rate": 0.00014917127071823204,
335
+ "loss": 0.0674,
336
+ "step": 9600
337
+ },
338
+ {
339
+ "epoch": 30.97,
340
+ "eval_loss": 0.6383547782897949,
341
+ "eval_runtime": 123.6323,
342
+ "eval_samples_per_second": 12.311,
343
+ "eval_wer": 0.4813768042798022,
344
+ "step": 9600
345
+ },
346
+ {
347
+ "epoch": 32.26,
348
+ "learning_rate": 0.00014254143646408839,
349
+ "loss": 0.0617,
350
+ "step": 10000
351
+ },
352
+ {
353
+ "epoch": 32.26,
354
+ "eval_loss": 0.6684080362319946,
355
+ "eval_runtime": 122.6948,
356
+ "eval_samples_per_second": 12.405,
357
+ "eval_wer": 0.47673362269102654,
358
+ "step": 10000
359
+ },
360
+ {
361
+ "epoch": 33.55,
362
+ "learning_rate": 0.00013591160220994473,
363
+ "loss": 0.059,
364
+ "step": 10400
365
+ },
366
+ {
367
+ "epoch": 33.55,
368
+ "eval_loss": 0.6545931696891785,
369
+ "eval_runtime": 115.3877,
370
+ "eval_samples_per_second": 13.19,
371
+ "eval_wer": 0.46734632078328453,
372
+ "step": 10400
373
+ },
374
+ {
375
+ "epoch": 34.84,
376
+ "learning_rate": 0.0001292817679558011,
377
+ "loss": 0.0598,
378
+ "step": 10800
379
+ },
380
+ {
381
+ "epoch": 34.84,
382
+ "eval_loss": 0.6368861794471741,
383
+ "eval_runtime": 115.1488,
384
+ "eval_samples_per_second": 13.218,
385
+ "eval_wer": 0.4668416271323307,
386
+ "step": 10800
387
+ },
388
+ {
389
+ "epoch": 36.13,
390
+ "learning_rate": 0.00012265193370165746,
391
+ "loss": 0.0558,
392
+ "step": 11200
393
+ },
394
+ {
395
+ "epoch": 36.13,
396
+ "eval_loss": 0.6463531851768494,
397
+ "eval_runtime": 115.1323,
398
+ "eval_samples_per_second": 13.22,
399
+ "eval_wer": 0.4656303623700414,
400
+ "step": 11200
401
+ },
402
+ {
403
+ "epoch": 37.42,
404
+ "learning_rate": 0.0001160220994475138,
405
+ "loss": 0.0568,
406
+ "step": 11600
407
+ },
408
+ {
409
+ "epoch": 37.42,
410
+ "eval_loss": 0.6061355471611023,
411
+ "eval_runtime": 115.4845,
412
+ "eval_samples_per_second": 13.179,
413
+ "eval_wer": 0.46966791157767235,
414
+ "step": 11600
415
+ },
416
+ {
417
+ "epoch": 38.71,
418
+ "learning_rate": 0.00010939226519337017,
419
+ "loss": 0.05,
420
+ "step": 12000
421
+ },
422
+ {
423
+ "epoch": 38.71,
424
+ "eval_loss": 0.6069867014884949,
425
+ "eval_runtime": 115.3997,
426
+ "eval_samples_per_second": 13.189,
427
+ "eval_wer": 0.46381346522660744,
428
+ "step": 12000
429
+ },
430
+ {
431
+ "epoch": 40.0,
432
+ "learning_rate": 0.00010276243093922651,
433
+ "loss": 0.0489,
434
+ "step": 12400
435
+ },
436
+ {
437
+ "epoch": 40.0,
438
+ "eval_loss": 0.6435591578483582,
439
+ "eval_runtime": 115.2889,
440
+ "eval_samples_per_second": 13.202,
441
+ "eval_wer": 0.45493085696981933,
442
+ "step": 12400
443
+ },
444
+ {
445
+ "epoch": 41.29,
446
+ "learning_rate": 9.613259668508287e-05,
447
+ "loss": 0.0508,
448
+ "step": 12800
449
+ },
450
+ {
451
+ "epoch": 41.29,
452
+ "eval_loss": 0.6377198696136475,
453
+ "eval_runtime": 115.7131,
454
+ "eval_samples_per_second": 13.153,
455
+ "eval_wer": 0.45493085696981933,
456
+ "step": 12800
457
+ },
458
+ {
459
+ "epoch": 42.58,
460
+ "learning_rate": 8.950276243093922e-05,
461
+ "loss": 0.0475,
462
+ "step": 13200
463
+ },
464
+ {
465
+ "epoch": 42.58,
466
+ "eval_loss": 0.6290169954299927,
467
+ "eval_runtime": 115.5463,
468
+ "eval_samples_per_second": 13.172,
469
+ "eval_wer": 0.45836277379630563,
470
+ "step": 13200
471
+ },
472
+ {
473
+ "epoch": 43.87,
474
+ "learning_rate": 8.287292817679558e-05,
475
+ "loss": 0.0415,
476
+ "step": 13600
477
+ },
478
+ {
479
+ "epoch": 43.87,
480
+ "eval_loss": 0.6694880723953247,
481
+ "eval_runtime": 115.6201,
482
+ "eval_samples_per_second": 13.164,
483
+ "eval_wer": 0.45291208236600383,
484
+ "step": 13600
485
+ },
486
+ {
487
+ "epoch": 45.16,
488
+ "learning_rate": 7.624309392265193e-05,
489
+ "loss": 0.0437,
490
+ "step": 14000
491
+ },
492
+ {
493
+ "epoch": 45.16,
494
+ "eval_loss": 0.5929790139198303,
495
+ "eval_runtime": 115.6043,
496
+ "eval_samples_per_second": 13.166,
497
+ "eval_wer": 0.44594730998284043,
498
+ "step": 14000
499
+ },
500
+ {
501
+ "epoch": 46.45,
502
+ "learning_rate": 6.961325966850828e-05,
503
+ "loss": 0.0405,
504
+ "step": 14400
505
+ },
506
+ {
507
+ "epoch": 46.45,
508
+ "eval_loss": 0.5978022813796997,
509
+ "eval_runtime": 115.5674,
510
+ "eval_samples_per_second": 13.17,
511
+ "eval_wer": 0.44857171696780057,
512
+ "step": 14400
513
+ },
514
+ {
515
+ "epoch": 47.74,
516
+ "learning_rate": 6.298342541436463e-05,
517
+ "loss": 0.0373,
518
+ "step": 14800
519
+ },
520
+ {
521
+ "epoch": 47.74,
522
+ "eval_loss": 0.6167137622833252,
523
+ "eval_runtime": 129.2696,
524
+ "eval_samples_per_second": 11.774,
525
+ "eval_wer": 0.4487735944281821,
526
+ "step": 14800
527
+ },
528
+ {
529
+ "epoch": 49.03,
530
+ "learning_rate": 5.635359116022099e-05,
531
+ "loss": 0.0351,
532
+ "step": 15200
533
+ },
534
+ {
535
+ "epoch": 49.03,
536
+ "eval_loss": 0.6160412430763245,
537
+ "eval_runtime": 132.6569,
538
+ "eval_samples_per_second": 11.473,
539
+ "eval_wer": 0.4417078833148279,
540
+ "step": 15200
541
+ },
542
+ {
543
+ "epoch": 50.32,
544
+ "learning_rate": 4.9723756906077343e-05,
545
+ "loss": 0.0373,
546
+ "step": 15600
547
+ },
548
+ {
549
+ "epoch": 50.32,
550
+ "eval_loss": 0.5868551731109619,
551
+ "eval_runtime": 116.8754,
552
+ "eval_samples_per_second": 13.022,
553
+ "eval_wer": 0.4385787826789139,
554
+ "step": 15600
555
+ },
556
+ {
557
+ "epoch": 51.61,
558
+ "learning_rate": 4.30939226519337e-05,
559
+ "loss": 0.0346,
560
+ "step": 16000
561
+ },
562
+ {
563
+ "epoch": 51.61,
564
+ "eval_loss": 0.6198846101760864,
565
+ "eval_runtime": 124.1832,
566
+ "eval_samples_per_second": 12.256,
567
+ "eval_wer": 0.4357524982335722,
568
+ "step": 16000
569
+ },
570
+ {
571
+ "epoch": 52.9,
572
+ "learning_rate": 3.646408839779005e-05,
573
+ "loss": 0.0328,
574
+ "step": 16400
575
+ },
576
+ {
577
+ "epoch": 52.9,
578
+ "eval_loss": 0.6003885269165039,
579
+ "eval_runtime": 124.3773,
580
+ "eval_samples_per_second": 12.237,
581
+ "eval_wer": 0.43999192490158473,
582
+ "step": 16400
583
+ },
584
+ {
585
+ "epoch": 54.19,
586
+ "learning_rate": 2.9834254143646404e-05,
587
+ "loss": 0.0315,
588
+ "step": 16800
589
+ },
590
+ {
591
+ "epoch": 54.19,
592
+ "eval_loss": 0.6082204580307007,
593
+ "eval_runtime": 124.1939,
594
+ "eval_samples_per_second": 12.255,
595
+ "eval_wer": 0.4374684566468154,
596
+ "step": 16800
597
+ },
598
+ {
599
+ "epoch": 55.48,
600
+ "learning_rate": 2.320441988950276e-05,
601
+ "loss": 0.0312,
602
+ "step": 17200
603
+ },
604
+ {
605
+ "epoch": 55.48,
606
+ "eval_loss": 0.6157354116439819,
607
+ "eval_runtime": 124.1732,
608
+ "eval_samples_per_second": 12.257,
609
+ "eval_wer": 0.4346421722014737,
610
+ "step": 17200
611
+ },
612
+ {
613
+ "epoch": 56.77,
614
+ "learning_rate": 1.6574585635359113e-05,
615
+ "loss": 0.031,
616
+ "step": 17600
617
+ },
618
+ {
619
+ "epoch": 56.77,
620
+ "eval_loss": 0.6112752556800842,
621
+ "eval_runtime": 125.7635,
622
+ "eval_samples_per_second": 12.102,
623
+ "eval_wer": 0.4303018068032704,
624
+ "step": 17600
625
+ },
626
+ {
627
+ "epoch": 58.06,
628
+ "learning_rate": 9.944751381215468e-06,
629
+ "loss": 0.0296,
630
+ "step": 18000
631
+ },
632
+ {
633
+ "epoch": 58.06,
634
+ "eval_loss": 0.6031031608581543,
635
+ "eval_runtime": 125.68,
636
+ "eval_samples_per_second": 12.11,
637
+ "eval_wer": 0.4288886645805996,
638
+ "step": 18000
639
+ },
640
+ {
641
+ "epoch": 59.35,
642
+ "learning_rate": 3.3149171270718227e-06,
643
+ "loss": 0.0287,
644
+ "step": 18400
645
+ },
646
+ {
647
+ "epoch": 59.35,
648
+ "eval_loss": 0.6061920523643494,
649
+ "eval_runtime": 124.5102,
650
+ "eval_samples_per_second": 12.224,
651
+ "eval_wer": 0.428686787120218,
652
+ "step": 18400
653
+ }
654
+ ],
655
+ "max_steps": 18600,
656
+ "num_train_epochs": 60,
657
+ "total_flos": 2.8159583015375258e+19,
658
+ "trial_name": null,
659
+ "trial_params": null
660
+ }
wav2vec2-large-xlsr-greek/{checkpoint-9200 → checkpoint-18400}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19745f202d764bbd63c72bfc82b74cad7c731fae95861ba3eaa8aae2d3c48a33
3
  size 2287
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea26d4ea8a1132b678b84593db9dedbab805a9db1b8a44ebab0fa258acec5bb1
3
  size 2287
wav2vec2-large-xlsr-greek/checkpoint-9200/trainer_state.json DELETED
@@ -1,338 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 29.677419354838708,
5
- "global_step": 9200,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 1.29,
12
- "learning_rate": 0.00023999999999999998,
13
- "loss": 7.3425,
14
- "step": 400
15
- },
16
- {
17
- "epoch": 1.29,
18
- "eval_loss": 3.277569532394409,
19
- "eval_runtime": 138.4344,
20
- "eval_samples_per_second": 10.994,
21
- "eval_wer": 1.0,
22
- "step": 400
23
- },
24
- {
25
- "epoch": 2.58,
26
- "learning_rate": 0.0002897727272727273,
27
- "loss": 2.6649,
28
- "step": 800
29
- },
30
- {
31
- "epoch": 2.58,
32
- "eval_loss": 1.1241918802261353,
33
- "eval_runtime": 115.4749,
34
- "eval_samples_per_second": 13.18,
35
- "eval_wer": 0.8526294539214697,
36
- "step": 800
37
- },
38
- {
39
- "epoch": 3.87,
40
- "learning_rate": 0.0002761363636363636,
41
- "loss": 0.7589,
42
- "step": 1200
43
- },
44
- {
45
- "epoch": 3.87,
46
- "eval_loss": 0.69745272397995,
47
- "eval_runtime": 137.7134,
48
- "eval_samples_per_second": 11.052,
49
- "eval_wer": 0.6960734833955788,
50
- "step": 1200
51
- },
52
- {
53
- "epoch": 5.16,
54
- "learning_rate": 0.0002625,
55
- "loss": 0.476,
56
- "step": 1600
57
- },
58
- {
59
- "epoch": 5.16,
60
- "eval_loss": 0.6482455730438232,
61
- "eval_runtime": 115.5054,
62
- "eval_samples_per_second": 13.177,
63
- "eval_wer": 0.6553951751286968,
64
- "step": 1600
65
- },
66
- {
67
- "epoch": 6.45,
68
- "learning_rate": 0.00024886363636363637,
69
- "loss": 0.3479,
70
- "step": 2000
71
- },
72
- {
73
- "epoch": 6.45,
74
- "eval_loss": 0.6039410829544067,
75
- "eval_runtime": 115.7857,
76
- "eval_samples_per_second": 13.145,
77
- "eval_wer": 0.5894821843141214,
78
- "step": 2000
79
- },
80
- {
81
- "epoch": 7.74,
82
- "learning_rate": 0.0002352272727272727,
83
- "loss": 0.2783,
84
- "step": 2400
85
- },
86
- {
87
- "epoch": 7.74,
88
- "eval_loss": 0.6183858513832092,
89
- "eval_runtime": 115.9363,
90
- "eval_samples_per_second": 13.128,
91
- "eval_wer": 0.5855455738366812,
92
- "step": 2400
93
- },
94
- {
95
- "epoch": 9.03,
96
- "learning_rate": 0.00022159090909090908,
97
- "loss": 0.2204,
98
- "step": 2800
99
- },
100
- {
101
- "epoch": 9.03,
102
- "eval_loss": 0.5993764996528625,
103
- "eval_runtime": 137.9021,
104
- "eval_samples_per_second": 11.037,
105
- "eval_wer": 0.550721711920864,
106
- "step": 2800
107
- },
108
- {
109
- "epoch": 10.32,
110
- "learning_rate": 0.00020795454545454546,
111
- "loss": 0.1881,
112
- "step": 3200
113
- },
114
- {
115
- "epoch": 10.32,
116
- "eval_loss": 0.5736687779426575,
117
- "eval_runtime": 138.6384,
118
- "eval_samples_per_second": 10.978,
119
- "eval_wer": 0.537801554456445,
120
- "step": 3200
121
- },
122
- {
123
- "epoch": 11.61,
124
- "learning_rate": 0.00019431818181818179,
125
- "loss": 0.1639,
126
- "step": 3600
127
- },
128
- {
129
- "epoch": 11.61,
130
- "eval_loss": 0.5788838267326355,
131
- "eval_runtime": 139.0007,
132
- "eval_samples_per_second": 10.95,
133
- "eval_wer": 0.5217522963561119,
134
- "step": 3600
135
- },
136
- {
137
- "epoch": 12.9,
138
- "learning_rate": 0.00018068181818181817,
139
- "loss": 0.1464,
140
- "step": 4000
141
- },
142
- {
143
- "epoch": 12.9,
144
- "eval_loss": 0.5988554358482361,
145
- "eval_runtime": 116.0695,
146
- "eval_samples_per_second": 13.113,
147
- "eval_wer": 0.5138790754012315,
148
- "step": 4000
149
- },
150
- {
151
- "epoch": 14.19,
152
- "learning_rate": 0.00016704545454545452,
153
- "loss": 0.1222,
154
- "step": 4400
155
- },
156
- {
157
- "epoch": 14.19,
158
- "eval_loss": 0.6150318384170532,
159
- "eval_runtime": 140.7397,
160
- "eval_samples_per_second": 10.814,
161
- "eval_wer": 0.5012617341273847,
162
- "step": 4400
163
- },
164
- {
165
- "epoch": 15.48,
166
- "learning_rate": 0.0001534090909090909,
167
- "loss": 0.112,
168
- "step": 4800
169
- },
170
- {
171
- "epoch": 15.48,
172
- "eval_loss": 0.6471191644668579,
173
- "eval_runtime": 138.3313,
174
- "eval_samples_per_second": 11.003,
175
- "eval_wer": 0.5148884627031391,
176
- "step": 4800
177
- },
178
- {
179
- "epoch": 16.77,
180
- "learning_rate": 0.00013977272727272726,
181
- "loss": 0.105,
182
- "step": 5200
183
- },
184
- {
185
- "epoch": 16.77,
186
- "eval_loss": 0.6167843341827393,
187
- "eval_runtime": 116.3058,
188
- "eval_samples_per_second": 13.086,
189
- "eval_wer": 0.49237912587059657,
190
- "step": 5200
191
- },
192
- {
193
- "epoch": 18.06,
194
- "learning_rate": 0.00012613636363636364,
195
- "loss": 0.0936,
196
- "step": 5600
197
- },
198
- {
199
- "epoch": 18.06,
200
- "eval_loss": 0.6400735378265381,
201
- "eval_runtime": 138.5841,
202
- "eval_samples_per_second": 10.983,
203
- "eval_wer": 0.4856162309478147,
204
- "step": 5600
205
- },
206
- {
207
- "epoch": 19.35,
208
- "learning_rate": 0.0001125,
209
- "loss": 0.0849,
210
- "step": 6000
211
- },
212
- {
213
- "epoch": 19.35,
214
- "eval_loss": 0.6149299740791321,
215
- "eval_runtime": 116.9081,
216
- "eval_samples_per_second": 13.019,
217
- "eval_wer": 0.47875239729484204,
218
- "step": 6000
219
- },
220
- {
221
- "epoch": 20.65,
222
- "learning_rate": 9.886363636363635e-05,
223
- "loss": 0.079,
224
- "step": 6400
225
- },
226
- {
227
- "epoch": 20.65,
228
- "eval_loss": 0.6410804986953735,
229
- "eval_runtime": 139.105,
230
- "eval_samples_per_second": 10.941,
231
- "eval_wer": 0.48067023316846674,
232
- "step": 6400
233
- },
234
- {
235
- "epoch": 21.94,
236
- "learning_rate": 8.522727272727273e-05,
237
- "loss": 0.0752,
238
- "step": 6800
239
- },
240
- {
241
- "epoch": 21.94,
242
- "eval_loss": 0.6123934388160706,
243
- "eval_runtime": 138.8056,
244
- "eval_samples_per_second": 10.965,
245
- "eval_wer": 0.4773392550721712,
246
- "step": 6800
247
- },
248
- {
249
- "epoch": 23.23,
250
- "learning_rate": 7.159090909090909e-05,
251
- "loss": 0.0688,
252
- "step": 7200
253
- },
254
- {
255
- "epoch": 23.23,
256
- "eval_loss": 0.6324551105499268,
257
- "eval_runtime": 137.9116,
258
- "eval_samples_per_second": 11.036,
259
- "eval_wer": 0.47198950237206017,
260
- "step": 7200
261
- },
262
- {
263
- "epoch": 24.52,
264
- "learning_rate": 5.795454545454545e-05,
265
- "loss": 0.0659,
266
- "step": 7600
267
- },
268
- {
269
- "epoch": 24.52,
270
- "eval_loss": 0.6281149387359619,
271
- "eval_runtime": 139.9793,
272
- "eval_samples_per_second": 10.873,
273
- "eval_wer": 0.464923791258706,
274
- "step": 7600
275
- },
276
- {
277
- "epoch": 25.81,
278
- "learning_rate": 4.431818181818182e-05,
279
- "loss": 0.0582,
280
- "step": 8000
281
- },
282
- {
283
- "epoch": 25.81,
284
- "eval_loss": 0.6326279640197754,
285
- "eval_runtime": 139.8038,
286
- "eval_samples_per_second": 10.887,
287
- "eval_wer": 0.4604824871303119,
288
- "step": 8000
289
- },
290
- {
291
- "epoch": 27.1,
292
- "learning_rate": 3.068181818181818e-05,
293
- "loss": 0.0551,
294
- "step": 8400
295
- },
296
- {
297
- "epoch": 27.1,
298
- "eval_loss": 0.6271815299987793,
299
- "eval_runtime": 139.3604,
300
- "eval_samples_per_second": 10.921,
301
- "eval_wer": 0.46300595538508127,
302
- "step": 8400
303
- },
304
- {
305
- "epoch": 28.39,
306
- "learning_rate": 1.7045454545454543e-05,
307
- "loss": 0.0508,
308
- "step": 8800
309
- },
310
- {
311
- "epoch": 28.39,
312
- "eval_loss": 0.6332981586456299,
313
- "eval_runtime": 139.0366,
314
- "eval_samples_per_second": 10.947,
315
- "eval_wer": 0.4594730998284042,
316
- "step": 8800
317
- },
318
- {
319
- "epoch": 29.68,
320
- "learning_rate": 3.4090909090909087e-06,
321
- "loss": 0.0517,
322
- "step": 9200
323
- },
324
- {
325
- "epoch": 29.68,
326
- "eval_loss": 0.6157954931259155,
327
- "eval_runtime": 140.2207,
328
- "eval_samples_per_second": 10.854,
329
- "eval_wer": 0.45119612395276065,
330
- "step": 9200
331
- }
332
- ],
333
- "max_steps": 9300,
334
- "num_train_epochs": 30,
335
- "total_flos": 1.4085584505165812e+19,
336
- "trial_name": null,
337
- "trial_params": null
338
- }
wav2vec2-large-xlsr-greek/vocab.json CHANGED
@@ -1 +1 @@
1
- {"ω": 0, "τ": 1, "´": 2, "ώ": 3, "e": 4, "h": 5, "n": 6, "ς": 7, "σ": 8, "ϋ": 9, "g": 10, "ΐ": 11, "́": 12, "ύ": 13, "ό": 14, "ψ": 15, "m": 16, "ά": 17, "α": 18, "ή": 19, "έ": 20, "": 21, "ο": 22, "π": 23, "λ": 24, "ζ": 25, "μ": 26, "β": 27, "γ": 28, "ν": 29, "ί": 30, "η": 31, "χ": 32, "υ": 33, "v": 34, "o": 35, "ε": 36, "«": 37, "ρ": 38, "φ": 39, "»": 41, "κ": 42, "θ": 43, "δ": 44, "'": 45, "ι": 46, "ξ": 47, "ϊ": 48, "a": 49, "r": 50, "|": 40, "[UNK]": 51, "[PAD]": 52}
 
1
+ {"ώ": 0, "γ": 1, "n": 2, "ϋ": 3, "κ": 4, "e": 5, "ξ": 6, "'": 7, "θ": 8, "": 9, "σ": 10, "η": 11, "ι": 12, "α": 13, "ε": 14, "υ": 15, "v": 16, "μ": 17, "ο": 18, "«": 19, "»": 20, "έ": 21, "ν": 22, "ά": 24, "o": 25, "ζ": 26, "β": 27, "τ": 28, "π": 29, "ή": 30, "ψ": 31, "ΐ": 32, "ό": 33, "h": 34, "ύ": 35, "ω": 36, "´": 37, "χ": 38, "ϊ": 39, "ρ": 40, "a": 41, "ς": 42, "r": 43, "g": 44, "m": 45, "λ": 46, "́": 47, "ί": 48, "φ": 49, "δ": 50, "|": 23, "[UNK]": 51, "[PAD]": 52}