smangrul committed on
Commit
1abc3e4
1 Parent(s): 51c4501

Upload train_model.ipynb

Files changed (1)
  1. train_model.ipynb +1436 -0
train_model.ipynb ADDED
@@ -0,0 +1,1436 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
11
+ "from datasets import load_dataset, load_metric, Audio, concatenate_datasets\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "Login successful\n",
24
+ "Your token has been saved to /home/ubuntu/.huggingface/token\n",
25
+ "\u001b[1m\u001b[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.\n",
26
+ "You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default\n",
27
+ "\n",
28
+ "git config --global credential.helper store\u001b[0m\n"
29
+ ]
30
+ }
31
+ ],
32
+ "source": [
33
+ "from huggingface_hub import notebook_login\n",
34
+ "\n",
35
+ "notebook_login()\n",
36
+ "repo_name = \"smangrul/xls-r-300m-mr\"\n"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 3,
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stderr",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "Reusing dataset open_slr (/home/ubuntu/.cache/huggingface/datasets/open_slr/SLR64/0.0.0/e0fb9e36094eff565efe812d1aba158f6a46ce834cb9705c91d1e2d6ba78ed31)\n"
49
+ ]
50
+ },
51
+ {
52
+ "name": "stdout",
53
+ "output_type": "stream",
54
+ "text": [
55
+ "Dataset({\n",
56
+ " features: ['path', 'audio', 'sentence'],\n",
57
+ " num_rows: 1569\n",
58
+ "})\n"
59
+ ]
60
+ },
61
+ {
62
+ "name": "stderr",
63
+ "output_type": "stream",
64
+ "text": [
65
+ "Reusing dataset common_voice (/home/ubuntu/.cache/huggingface/datasets/mozilla-foundation___common_voice/mr/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n",
66
+ "Reusing dataset common_voice (/home/ubuntu/.cache/huggingface/datasets/mozilla-foundation___common_voice/mr/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n"
67
+ ]
68
+ },
69
+ {
70
+ "name": "stdout",
71
+ "output_type": "stream",
72
+ "text": [
73
+ "Dataset({\n",
74
+ " features: ['path', 'audio', 'sentence'],\n",
75
+ " num_rows: 698\n",
76
+ "})\n"
77
+ ]
78
+ }
79
+ ],
80
+ "source": [
81
+ "\n",
82
+ "openslr = load_dataset(\"openslr\", \"SLR64\", split=\"train\")\n",
83
+ "print(openslr)\n",
84
+ "\n",
85
+ "common_voice_train = load_dataset(\"mozilla-foundation/common_voice_8_0\", \"mr\", split=\"train+validation\", use_auth_token=True)\n",
86
+ "common_voice_test = load_dataset(\"mozilla-foundation/common_voice_8_0\", \"mr\", split=\"test\", use_auth_token=True)\n",
87
+ "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
88
+ "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
89
+ "print(common_voice_train)\n",
90
+ "\n"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 4,
96
+ "metadata": {},
97
+ "outputs": [
98
+ {
99
+ "data": {
100
+ "text/plain": [
101
+ "Dataset({\n",
102
+ " features: ['path', 'audio', 'sentence'],\n",
103
+ " num_rows: 2267\n",
104
+ "})"
105
+ ]
106
+ },
107
+ "execution_count": 4,
108
+ "metadata": {},
109
+ "output_type": "execute_result"
110
+ }
111
+ ],
112
+ "source": [
113
+ "train_data = concatenate_datasets([common_voice_train, openslr])\n",
114
+ "train_data"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 5,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "import re\n",
124
+ "import unicodedata\n",
125
+ "chars_to_remove_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–\\।\\!\\\"\\,\\-\\.\\?\\:\\|\\“\\”\\–\\;\\'\\’\\‘\\॔\\u200c\\u200d]'\n",
126
+ "\n",
127
+ "def remove_special_characters(batch):\n",
128
+ " batch[\"sentence\"] = re.sub(chars_to_remove_regex, '', batch[\"sentence\"]).lower()\n",
129
+ " batch[\"sentence\"] = unicodedata.normalize(\"NFKC\", batch[\"sentence\"])\n",
130
+ " return batch"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 6,
136
+ "metadata": {},
137
+ "outputs": [
138
+ {
139
+ "name": "stderr",
140
+ "output_type": "stream",
141
+ "text": [
142
+ "Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/mozilla-foundation___common_voice/mr/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8/cache-86933e1c6f2c17a9.arrow\n",
143
+ "Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/mozilla-foundation___common_voice/mr/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8/cache-0b71d94dfe9f8e07.arrow\n"
144
+ ]
145
+ }
146
+ ],
147
+ "source": [
148
+ "train_dataset = train_data.map(remove_special_characters)\n",
149
+ "test_dataset = common_voice_test.map(remove_special_characters)"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 7,
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "def extract_all_chars(batch):\n",
159
+ " all_text = \" \".join(batch[\"sentence\"])\n",
160
+ " vocab = list(set(all_text))\n",
161
+ " return {\"vocab\": [vocab], \"all_text\": [all_text]}"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 8,
167
+ "metadata": {},
168
+ "outputs": [
169
+ {
170
+ "data": {
171
+ "application/vnd.jupyter.widget-view+json": {
172
+ "model_id": "54586502931b4e99ab8e4cb90cb9fbc0",
173
+ "version_major": 2,
174
+ "version_minor": 0
175
+ },
176
+ "text/plain": [
177
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
178
+ ]
179
+ },
180
+ "metadata": {},
181
+ "output_type": "display_data"
182
+ },
183
+ {
184
+ "data": {
185
+ "application/vnd.jupyter.widget-view+json": {
186
+ "model_id": "8fd87aff4e5f483daf6c6e5a4a00e37b",
187
+ "version_major": 2,
188
+ "version_minor": 0
189
+ },
190
+ "text/plain": [
191
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
192
+ ]
193
+ },
194
+ "metadata": {},
195
+ "output_type": "display_data"
196
+ }
197
+ ],
198
+ "source": [
199
+ "vocab_train = train_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_dataset.column_names)\n",
200
+ "vocab_test = test_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_dataset.column_names)\n"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 9,
206
+ "metadata": {},
207
+ "outputs": [
208
+ {
209
+ "data": {
210
+ "text/plain": [
211
+ "{' ': 0,\n",
212
+ " 'ँ': 1,\n",
213
+ " 'ं': 2,\n",
214
+ " 'ः': 3,\n",
215
+ " 'अ': 4,\n",
216
+ " 'आ': 5,\n",
217
+ " 'इ': 6,\n",
218
+ " 'ई': 7,\n",
219
+ " 'उ': 8,\n",
220
+ " 'ऊ': 9,\n",
221
+ " 'ऋ': 10,\n",
222
+ " 'ए': 11,\n",
223
+ " 'ऐ': 12,\n",
224
+ " 'ऑ': 13,\n",
225
+ " 'ओ': 14,\n",
226
+ " 'औ': 15,\n",
227
+ " 'क': 16,\n",
228
+ " 'ख': 17,\n",
229
+ " 'ग': 18,\n",
230
+ " 'घ': 19,\n",
231
+ " 'च': 20,\n",
232
+ " 'छ': 21,\n",
233
+ " 'ज': 22,\n",
234
+ " 'झ': 23,\n",
235
+ " 'ञ': 24,\n",
236
+ " 'ट': 25,\n",
237
+ " 'ठ': 26,\n",
238
+ " 'ड': 27,\n",
239
+ " 'ढ': 28,\n",
240
+ " 'ण': 29,\n",
241
+ " 'त': 30,\n",
242
+ " 'थ': 31,\n",
243
+ " 'द': 32,\n",
244
+ " 'ध': 33,\n",
245
+ " 'न': 34,\n",
246
+ " 'प': 35,\n",
247
+ " 'फ': 36,\n",
248
+ " 'ब': 37,\n",
249
+ " 'भ': 38,\n",
250
+ " 'म': 39,\n",
251
+ " 'य': 40,\n",
252
+ " 'र': 41,\n",
253
+ " 'ऱ': 42,\n",
254
+ " 'ल': 43,\n",
255
+ " 'ळ': 44,\n",
256
+ " 'व': 45,\n",
257
+ " 'श': 46,\n",
258
+ " 'ष': 47,\n",
259
+ " 'स': 48,\n",
260
+ " 'ह': 49,\n",
261
+ " '़': 50,\n",
262
+ " 'ा': 51,\n",
263
+ " 'ि': 52,\n",
264
+ " 'ी': 53,\n",
265
+ " 'ु': 54,\n",
266
+ " 'ू': 55,\n",
267
+ " 'ृ': 56,\n",
268
+ " 'ॄ': 57,\n",
269
+ " 'ॅ': 58,\n",
270
+ " 'े': 59,\n",
271
+ " 'ै': 60,\n",
272
+ " 'ॉ': 61,\n",
273
+ " 'ॊ': 62,\n",
274
+ " 'ो': 63,\n",
275
+ " 'ौ': 64,\n",
276
+ " '्': 65,\n",
277
+ " 'ॲ': 66}"
278
+ ]
279
+ },
280
+ "execution_count": 9,
281
+ "metadata": {},
282
+ "output_type": "execute_result"
283
+ }
284
+ ],
285
+ "source": [
286
+ "vocab_list = list(set(vocab_train[\"vocab\"][0]) | set(vocab_test[\"vocab\"][0]))\n",
287
+ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n",
288
+ "vocab_dict"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": 10,
294
+ "metadata": {},
295
+ "outputs": [],
296
+ "source": [
297
+ "vocab_dict[\"|\"] = vocab_dict[\" \"]\n",
298
+ "del vocab_dict[\" \"]"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": 11,
304
+ "metadata": {},
305
+ "outputs": [
306
+ {
307
+ "data": {
308
+ "text/plain": [
309
+ "69"
310
+ ]
311
+ },
312
+ "execution_count": 11,
313
+ "metadata": {},
314
+ "output_type": "execute_result"
315
+ }
316
+ ],
317
+ "source": [
318
+ "vocab_dict[\"[UNK]\"] = len(vocab_dict)\n",
319
+ "vocab_dict[\"[PAD]\"] = len(vocab_dict)\n",
320
+ "len(vocab_dict)"
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "code",
325
+ "execution_count": 12,
326
+ "metadata": {},
327
+ "outputs": [],
328
+ "source": [
329
+ "import json\n",
330
+ "with open('vocab.json', 'w') as vocab_file:\n",
331
+ " json.dump(vocab_dict, vocab_file)"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": 3,
337
+ "metadata": {},
338
+ "outputs": [
339
+ {
340
+ "name": "stderr",
341
+ "output_type": "stream",
342
+ "text": [
343
+ "file ./config.json not found\n",
344
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
345
+ "To https://huggingface.co/smangrul/xls-r-300m-mr\n",
346
+ " 41422b3..c87c689 main -> main\n",
347
+ "\n"
348
+ ]
349
+ },
350
+ {
351
+ "data": {
352
+ "text/plain": [
353
+ "'https://huggingface.co/smangrul/xls-r-300m-mr/commit/c87c689895462fd42a184ae74fffebe69a4078e8'"
354
+ ]
355
+ },
356
+ "execution_count": 3,
357
+ "metadata": {},
358
+ "output_type": "execute_result"
359
+ }
360
+ ],
361
+ "source": [
362
+ "from transformers import Wav2Vec2CTCTokenizer\n",
363
+ "\n",
364
+ "tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(\"./\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n",
365
+ "tokenizer.push_to_hub(repo_name)"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 4,
371
+ "metadata": {},
372
+ "outputs": [],
373
+ "source": [
374
+ "from transformers import Wav2Vec2FeatureExtractor\n",
375
+ "\n",
376
+ "feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": 5,
382
+ "metadata": {},
383
+ "outputs": [],
384
+ "source": [
385
+ "from transformers import Wav2Vec2Processor\n",
386
+ "\n",
387
+ "processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": 16,
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "train_dataset = train_dataset.cast_column(\"audio\", Audio(sampling_rate=16_000))\n",
397
+ "test_dataset = test_dataset.cast_column(\"audio\", Audio(sampling_rate=16_000))"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": 17,
403
+ "metadata": {},
404
+ "outputs": [],
405
+ "source": [
406
+ "def prepare_dataset(batch):\n",
407
+ " audio = batch[\"audio\"]\n",
408
+ "\n",
409
+ " # batched output is \"un-batched\"\n",
410
+ " batch[\"input_values\"] = processor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_values[0]\n",
411
+ " batch[\"input_length\"] = len(batch[\"input_values\"])\n",
412
+ " \n",
413
+ " with processor.as_target_processor():\n",
414
+ " batch[\"labels\"] = processor(batch[\"sentence\"]).input_ids\n",
415
+ " return batch"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "execution_count": 18,
421
+ "metadata": {},
422
+ "outputs": [
423
+ {
424
+ "data": {
425
+ "application/vnd.jupyter.widget-view+json": {
426
+ "model_id": "a096ebabad914b1f964e3a88f7763913",
427
+ "version_major": 2,
428
+ "version_minor": 0
429
+ },
430
+ "text/plain": [
431
+ " 0%| | 0/2267 [00:00<?, ?ex/s]"
432
+ ]
433
+ },
434
+ "metadata": {},
435
+ "output_type": "display_data"
436
+ },
437
+ {
438
+ "data": {
439
+ "application/vnd.jupyter.widget-view+json": {
440
+ "model_id": "a35a844a29f748cb9dc8c96c9576cfd6",
441
+ "version_major": 2,
442
+ "version_minor": 0
443
+ },
444
+ "text/plain": [
445
+ " 0%| | 0/306 [00:00<?, ?ex/s]"
446
+ ]
447
+ },
448
+ "metadata": {},
449
+ "output_type": "display_data"
450
+ }
451
+ ],
452
+ "source": [
453
+ "train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names)\n",
454
+ "test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)"
455
+ ]
456
+ },
457
+ {
458
+ "cell_type": "code",
459
+ "execution_count": 7,
460
+ "metadata": {},
461
+ "outputs": [],
462
+ "source": [
463
+ "from datasets import load_from_disk\n",
464
+ "train_dataset = load_from_disk(\"./Data/train_dataset\")\n",
465
+ "test_dataset = load_from_disk(\"./Data/test_dataset\")"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 8,
471
+ "metadata": {},
472
+ "outputs": [
473
+ {
474
+ "data": {
475
+ "text/plain": [
476
+ "Dataset({\n",
477
+ " features: ['input_values', 'input_length', 'labels'],\n",
478
+ " num_rows: 2267\n",
479
+ "})"
480
+ ]
481
+ },
482
+ "execution_count": 8,
483
+ "metadata": {},
484
+ "output_type": "execute_result"
485
+ }
486
+ ],
487
+ "source": [
488
+ "train_dataset"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": 9,
494
+ "metadata": {},
495
+ "outputs": [
496
+ {
497
+ "data": {
498
+ "text/plain": [
499
+ "Dataset({\n",
500
+ " features: ['input_values', 'input_length', 'labels'],\n",
501
+ " num_rows: 306\n",
502
+ "})"
503
+ ]
504
+ },
505
+ "execution_count": 9,
506
+ "metadata": {},
507
+ "output_type": "execute_result"
508
+ }
509
+ ],
510
+ "source": [
511
+ "test_dataset"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "code",
516
+ "execution_count": 10,
517
+ "metadata": {},
518
+ "outputs": [],
519
+ "source": [
520
+ "import torch\n",
521
+ "\n",
522
+ "from dataclasses import dataclass, field\n",
523
+ "from typing import Any, Dict, List, Optional, Union\n",
524
+ "\n",
525
+ "@dataclass\n",
526
+ "class DataCollatorCTCWithPadding:\n",
527
+ " \"\"\"\n",
528
+ " Data collator that will dynamically pad the inputs received.\n",
529
+ " Args:\n",
530
+ " processor (:class:`~transformers.Wav2Vec2Processor`)\n",
531
+ " The processor used for proccessing the data.\n",
532
+ " padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):\n",
533
+ " Select a strategy to pad the returned sequences (according to the model's padding side and padding index)\n",
534
+ " among:\n",
535
+ " * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single\n",
536
+ " sequence if provided).\n",
537
+ " * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the\n",
538
+ " maximum acceptable input length for the model if that argument is not provided.\n",
539
+ " * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of\n",
540
+ " different lengths).\n",
541
+ " \"\"\"\n",
542
+ "\n",
543
+ " processor: Wav2Vec2Processor\n",
544
+ " padding: Union[bool, str] = True\n",
545
+ " \n",
546
+ " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
547
+ " # split inputs and labels since they have to be of different lenghts and need\n",
548
+ " # different padding methods\n",
549
+ " input_features = [{\"input_values\": feature[\"input_values\"]} for feature in features]\n",
550
+ " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
551
+ "\n",
552
+ " batch = self.processor.pad(\n",
553
+ " input_features,\n",
554
+ " padding=self.padding,\n",
555
+ " return_tensors=\"pt\",\n",
556
+ " )\n",
557
+ " with self.processor.as_target_processor():\n",
558
+ " labels_batch = self.processor.pad(\n",
559
+ " label_features,\n",
560
+ " padding=self.padding,\n",
561
+ " return_tensors=\"pt\",\n",
562
+ " )\n",
563
+ "\n",
564
+ " # replace padding with -100 to ignore loss correctly\n",
565
+ " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
566
+ "\n",
567
+ " batch[\"labels\"] = labels\n",
568
+ "\n",
569
+ " return batch"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 11,
575
+ "metadata": {},
576
+ "outputs": [],
577
+ "source": [
578
+ "data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)"
579
+ ]
580
+ },
581
+ {
582
+ "cell_type": "code",
583
+ "execution_count": 12,
584
+ "metadata": {},
585
+ "outputs": [],
586
+ "source": [
587
+ "wer_metric = load_metric(\"wer\")"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": 13,
593
+ "metadata": {},
594
+ "outputs": [],
595
+ "source": [
596
+ "import numpy as np\n",
597
+ "def compute_metrics(pred):\n",
598
+ " pred_logits = pred.predictions\n",
599
+ " pred_ids = np.argmax(pred_logits, axis=-1)\n",
600
+ "\n",
601
+ " pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n",
602
+ "\n",
603
+ " pred_str = processor.batch_decode(pred_ids)\n",
604
+ " # we do not want to group tokens when computing the metrics\n",
605
+ " label_str = processor.batch_decode(pred.label_ids, group_tokens=False)\n",
606
+ "\n",
607
+ " wer = wer_metric.compute(predictions=pred_str, references=label_str)\n",
608
+ "\n",
609
+ " return {\"wer\": wer}"
610
+ ]
611
+ },
612
+ {
613
+ "cell_type": "code",
614
+ "execution_count": 14,
615
+ "metadata": {},
616
+ "outputs": [
617
+ {
618
+ "name": "stderr",
619
+ "output_type": "stream",
620
+ "text": [
621
+ "Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_q.bias', 'project_q.weight', 'project_hid.weight', 'quantizer.weight_proj.bias', 'quantizer.codevectors', 'project_hid.bias', 'quantizer.weight_proj.weight']\n",
622
+ "- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
623
+ "- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
624
+ "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']\n",
625
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
626
+ ]
627
+ }
628
+ ],
629
+ "source": [
630
+ "from transformers import Wav2Vec2ForCTC\n",
631
+ "\n",
632
+ "model = Wav2Vec2ForCTC.from_pretrained(\n",
633
+ " \"facebook/wav2vec2-xls-r-300m\", \n",
634
+ " attention_dropout=0.1,\n",
635
+ " layerdrop=0.0,\n",
636
+ " feat_proj_dropout=0.0,\n",
637
+ " mask_time_prob=0.75,\n",
638
+ " mask_time_length=10,\n",
639
+ " mask_feature_prob=0.25,\n",
640
+ " mask_feature_length=64,\n",
641
+ " ctc_loss_reduction=\"mean\", \n",
642
+ " pad_token_id=processor.tokenizer.pad_token_id,\n",
643
+ " vocab_size=len(processor.tokenizer),\n",
644
+ ")"
645
+ ]
646
+ },
647
+ {
648
+ "cell_type": "code",
649
+ "execution_count": 15,
650
+ "metadata": {},
651
+ "outputs": [
652
+ {
653
+ "name": "stderr",
654
+ "output_type": "stream",
655
+ "text": [
656
+ "/home/ubuntu/transformers/src/transformers/models/wav2vec2/modeling_wav2vec2.py:1717: FutureWarning: The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5.Please use the equivalent `freeze_feature_encoder` method instead.\n",
657
+ " FutureWarning,\n"
658
+ ]
659
+ }
660
+ ],
661
+ "source": [
662
+ "model.freeze_feature_extractor()"
663
+ ]
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "execution_count": 16,
668
+ "metadata": {},
669
+ "outputs": [],
670
+ "source": [
671
+ "from transformers import TrainingArguments\n",
672
+ "\n",
673
+ "training_args = TrainingArguments(\n",
674
+ " output_dir=repo_name,\n",
675
+ " group_by_length=True,\n",
676
+ " per_device_train_batch_size=16,\n",
677
+ " gradient_accumulation_steps=2,\n",
678
+ " evaluation_strategy=\"steps\",\n",
679
+ " num_train_epochs=200,\n",
680
+ " gradient_checkpointing=True,\n",
681
+ " fp16=True,\n",
682
+ " save_steps=400,\n",
683
+ " eval_steps=400,\n",
684
+ " logging_steps=100,\n",
685
+ " learning_rate=1e-4,\n",
686
+ " warmup_steps=1000,\n",
687
+ " save_total_limit=1,\n",
688
+ " push_to_hub=True,\n",
689
+ ")"
690
+ ]
691
+ },
692
+ {
693
+ "cell_type": "code",
694
+ "execution_count": 17,
695
+ "metadata": {},
696
+ "outputs": [
697
+ {
698
+ "name": "stderr",
699
+ "output_type": "stream",
700
+ "text": [
701
+ "/ebs/learn/ASR/smangrul/xls-r-300m-mr is already a clone of https://huggingface.co/smangrul/xls-r-300m-mr. Make sure you pull the latest changes with `repo.git_pull()`.\n",
702
+ "Using amp half precision backend\n"
703
+ ]
704
+ }
705
+ ],
706
+ "source": [
707
+ "from transformers import Trainer\n",
708
+ "\n",
709
+ "trainer = Trainer(\n",
710
+ " model=model,\n",
711
+ " data_collator=data_collator,\n",
712
+ " args=training_args,\n",
713
+ " compute_metrics=compute_metrics,\n",
714
+ " train_dataset=train_dataset,\n",
715
+ " eval_dataset=test_dataset,\n",
716
+ " tokenizer=processor.feature_extractor,\n",
717
+ ")\n"
718
+ ]
719
+ },
720
+ {
721
+ "cell_type": "code",
722
+ "execution_count": 18,
723
+ "metadata": {},
724
+ "outputs": [
725
+ {
726
+ "name": "stderr",
727
+ "output_type": "stream",
728
+ "text": [
729
+ "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
730
+ "/home/ubuntu/transformers/src/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use thePyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
731
+ " FutureWarning,\n",
732
+ "***** Running training *****\n",
733
+ " Num examples = 2267\n",
734
+ " Num Epochs = 200\n",
735
+ " Instantaneous batch size per device = 16\n",
736
+ " Total train batch size (w. parallel, distributed & accumulation) = 32\n",
737
+ " Gradient Accumulation steps = 2\n",
738
+ " Total optimization steps = 14200\n"
739
+ ]
740
+ },
741
+ {
742
+ "data": {
743
+ "text/html": [
744
+ "\n",
745
+ " <div>\n",
746
+ " \n",
747
+ " <progress value='14200' max='14200' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
748
+ " [14200/14200 10:40:12, Epoch 200/200]\n",
749
+ " </div>\n",
750
+ " <table border=\"1\" class=\"dataframe\">\n",
751
+ " <thead>\n",
752
+ " <tr style=\"text-align: left;\">\n",
753
+ " <th>Step</th>\n",
754
+ " <th>Training Loss</th>\n",
755
+ " <th>Validation Loss</th>\n",
756
+ " <th>Wer</th>\n",
757
+ " </tr>\n",
758
+ " </thead>\n",
759
+ " <tbody>\n",
760
+ " <tr>\n",
761
+ " <td>400</td>\n",
762
+ " <td>3.794000</td>\n",
763
+ " <td>3.532227</td>\n",
764
+ " <td>1.000000</td>\n",
765
+ " </tr>\n",
766
+ " <tr>\n",
767
+ " <td>800</td>\n",
768
+ " <td>3.362400</td>\n",
769
+ " <td>3.359044</td>\n",
770
+ " <td>1.000000</td>\n",
771
+ " </tr>\n",
772
+ " <tr>\n",
773
+ " <td>1200</td>\n",
774
+ " <td>2.293900</td>\n",
775
+ " <td>1.011279</td>\n",
776
+ " <td>0.829924</td>\n",
777
+ " </tr>\n",
778
+ " <tr>\n",
779
+ " <td>1600</td>\n",
780
+ " <td>1.233000</td>\n",
781
+ " <td>0.502743</td>\n",
782
+ " <td>0.593662</td>\n",
783
+ " </tr>\n",
784
+ " <tr>\n",
785
+ " <td>2000</td>\n",
786
+ " <td>0.962600</td>\n",
787
+ " <td>0.412519</td>\n",
788
+ " <td>0.496992</td>\n",
789
+ " </tr>\n",
790
+ " <tr>\n",
791
+ " <td>2400</td>\n",
792
+ " <td>0.831800</td>\n",
793
+ " <td>0.402903</td>\n",
794
+ " <td>0.493783</td>\n",
795
+ " </tr>\n",
796
+ " <tr>\n",
797
+ " <td>2800</td>\n",
798
+ " <td>0.737000</td>\n",
799
+ " <td>0.389773</td>\n",
800
+ " <td>0.469314</td>\n",
801
+ " </tr>\n",
802
+ " <tr>\n",
803
+ " <td>3200</td>\n",
804
+ " <td>0.677100</td>\n",
805
+ " <td>0.373987</td>\n",
806
+ " <td>0.436021</td>\n",
807
+ " </tr>\n",
808
+ " <tr>\n",
809
+ " <td>3600</td>\n",
810
+ " <td>0.634400</td>\n",
811
+ " <td>0.383823</td>\n",
812
+ " <td>0.432010</td>\n",
813
+ " </tr>\n",
814
+ " <tr>\n",
815
+ " <td>4000</td>\n",
816
+ " <td>0.586000</td>\n",
817
+ " <td>0.375610</td>\n",
818
+ " <td>0.419575</td>\n",
819
+ " </tr>\n",
820
+ " <tr>\n",
821
+ " <td>4400</td>\n",
822
+ " <td>0.561000</td>\n",
823
+ " <td>0.387891</td>\n",
824
+ " <td>0.418371</td>\n",
825
+ " </tr>\n",
826
+ " <tr>\n",
827
+ " <td>4800</td>\n",
828
+ " <td>0.518500</td>\n",
829
+ " <td>0.386357</td>\n",
830
+ " <td>0.417569</td>\n",
831
+ " </tr>\n",
832
+ " <tr>\n",
833
+ " <td>5200</td>\n",
834
+ " <td>0.515300</td>\n",
835
+ " <td>0.415069</td>\n",
836
+ " <td>0.430004</td>\n",
837
+ " </tr>\n",
838
+ " <tr>\n",
839
+ " <td>5600</td>\n",
840
+ " <td>0.478100</td>\n",
841
+ " <td>0.399211</td>\n",
842
+ " <td>0.408744</td>\n",
843
+ " </tr>\n",
844
+ " <tr>\n",
845
+ " <td>6000</td>\n",
846
+ " <td>0.468100</td>\n",
847
+ " <td>0.424542</td>\n",
848
+ " <td>0.402327</td>\n",
849
+ " </tr>\n",
850
+ " <tr>\n",
851
+ " <td>6400</td>\n",
852
+ " <td>0.439400</td>\n",
853
+ " <td>0.430979</td>\n",
854
+ " <td>0.410750</td>\n",
855
+ " </tr>\n",
856
+ " <tr>\n",
857
+ " <td>6800</td>\n",
858
+ " <td>0.429600</td>\n",
859
+ " <td>0.427700</td>\n",
860
+ " <td>0.409146</td>\n",
861
+ " </tr>\n",
862
+ " <tr>\n",
863
+ " <td>7200</td>\n",
864
+ " <td>0.400300</td>\n",
865
+ " <td>0.451111</td>\n",
866
+ " <td>0.419976</td>\n",
867
+ " </tr>\n",
868
+ " <tr>\n",
869
+ " <td>7600</td>\n",
870
+ " <td>0.395100</td>\n",
871
+ " <td>0.463446</td>\n",
872
+ " <td>0.405134</td>\n",
873
+ " </tr>\n",
874
+ " <tr>\n",
875
+ " <td>8000</td>\n",
876
+ " <td>0.381800</td>\n",
877
+ " <td>0.454752</td>\n",
878
+ " <td>0.407942</td>\n",
879
+ " </tr>\n",
880
+ " <tr>\n",
881
+ " <td>8400</td>\n",
882
+ " <td>0.371500</td>\n",
883
+ " <td>0.461547</td>\n",
884
+ " <td>0.404733</td>\n",
885
+ " </tr>\n",
886
+ " <tr>\n",
887
+ " <td>8800</td>\n",
888
+ " <td>0.362500</td>\n",
889
+ " <td>0.461543</td>\n",
890
+ " <td>0.411151</td>\n",
891
+ " </tr>\n",
892
+ " <tr>\n",
893
+ " <td>9200</td>\n",
894
+ " <td>0.338200</td>\n",
895
+ " <td>0.468299</td>\n",
896
+ " <td>0.417168</td>\n",
897
+ " </tr>\n",
898
+ " <tr>\n",
899
+ " <td>9600</td>\n",
900
+ " <td>0.338800</td>\n",
901
+ " <td>0.480989</td>\n",
902
+ " <td>0.412355</td>\n",
903
+ " </tr>\n",
904
+ " <tr>\n",
905
+ " <td>10000</td>\n",
906
+ " <td>0.317600</td>\n",
907
+ " <td>0.475700</td>\n",
908
+ " <td>0.410750</td>\n",
909
+ " </tr>\n",
910
+ " <tr>\n",
911
+ " <td>10400</td>\n",
912
+ " <td>0.315100</td>\n",
913
+ " <td>0.478920</td>\n",
914
+ " <td>0.403530</td>\n",
915
+ " </tr>\n",
916
+ " <tr>\n",
917
+ " <td>10800</td>\n",
918
+ " <td>0.296200</td>\n",
919
+ " <td>0.480600</td>\n",
920
+ " <td>0.398315</td>\n",
921
+ " </tr>\n",
922
+ " <tr>\n",
923
+ " <td>11200</td>\n",
924
+ " <td>0.299000</td>\n",
925
+ " <td>0.477083</td>\n",
926
+ " <td>0.393502</td>\n",
927
+ " </tr>\n",
928
+ " <tr>\n",
929
+ " <td>11600</td>\n",
930
+ " <td>0.290000</td>\n",
931
+ " <td>0.465646</td>\n",
932
+ " <td>0.393903</td>\n",
933
+ " </tr>\n",
934
+ " <tr>\n",
935
+ " <td>12000</td>\n",
936
+ " <td>0.290900</td>\n",
937
+ " <td>0.490041</td>\n",
938
+ " <td>0.405937</td>\n",
939
+ " </tr>\n",
940
+ " <tr>\n",
941
+ " <td>12400</td>\n",
942
+ " <td>0.275600</td>\n",
943
+ " <td>0.489354</td>\n",
944
+ " <td>0.399519</td>\n",
945
+ " </tr>\n",
946
+ " <tr>\n",
947
+ " <td>12800</td>\n",
948
+ " <td>0.272600</td>\n",
949
+ " <td>0.494580</td>\n",
950
+ " <td>0.395909</td>\n",
951
+ " </tr>\n",
952
+ " <tr>\n",
953
+ " <td>13200</td>\n",
954
+ " <td>0.265900</td>\n",
955
+ " <td>0.497918</td>\n",
956
+ " <td>0.397112</td>\n",
957
+ " </tr>\n",
958
+ " <tr>\n",
959
+ " <td>13600</td>\n",
960
+ " <td>0.266300</td>\n",
961
+ " <td>0.498627</td>\n",
962
+ " <td>0.397513</td>\n",
963
+ " </tr>\n",
964
+ " <tr>\n",
965
+ " <td>14000</td>\n",
966
+ " <td>0.259600</td>\n",
967
+ " <td>0.504610</td>\n",
968
+ " <td>0.401524</td>\n",
969
+ " </tr>\n",
970
+ " </tbody>\n",
971
+ "</table><p>"
972
+ ],
973
+ "text/plain": [
974
+ "<IPython.core.display.HTML object>"
975
+ ]
976
+ },
977
+ "metadata": {},
978
+ "output_type": "display_data"
979
+ },
980
+ {
981
+ "name": "stderr",
982
+ "output_type": "stream",
983
+ "text": [
984
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
985
+ "***** Running Evaluation *****\n",
986
+ " Num examples = 306\n",
987
+ " Batch size = 8\n",
988
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-400\n",
989
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-400/config.json\n",
990
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-400/pytorch_model.bin\n",
991
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-400/preprocessor_config.json\n",
992
+ "Configuration saved in smangrul/xls-r-300m-mr/preprocessor_config.json\n",
993
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
994
+ "***** Running Evaluation *****\n",
995
+ " Num examples = 306\n",
996
+ " Batch size = 8\n",
997
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-800\n",
998
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-800/config.json\n",
999
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-800/pytorch_model.bin\n",
1000
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-800/preprocessor_config.json\n",
1001
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-400] due to args.save_total_limit\n",
1002
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1003
+ "***** Running Evaluation *****\n",
1004
+ " Num examples = 306\n",
1005
+ " Batch size = 8\n",
1006
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-1200\n",
1007
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-1200/config.json\n",
1008
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-1200/pytorch_model.bin\n",
1009
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-1200/preprocessor_config.json\n",
1010
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-800] due to args.save_total_limit\n",
1011
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1012
+ "***** Running Evaluation *****\n",
1013
+ " Num examples = 306\n",
1014
+ " Batch size = 8\n",
1015
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-1600\n",
1016
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-1600/config.json\n",
1017
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-1600/pytorch_model.bin\n",
1018
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-1600/preprocessor_config.json\n",
1019
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-1200] due to args.save_total_limit\n",
1020
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1021
+ "***** Running Evaluation *****\n",
1022
+ " Num examples = 306\n",
1023
+ " Batch size = 8\n",
1024
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-2000\n",
1025
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-2000/config.json\n",
1026
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-2000/pytorch_model.bin\n",
1027
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-2000/preprocessor_config.json\n",
1028
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-1600] due to args.save_total_limit\n",
1029
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1030
+ "***** Running Evaluation *****\n",
1031
+ " Num examples = 306\n",
1032
+ " Batch size = 8\n",
1033
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-2400\n",
1034
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-2400/config.json\n",
1035
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-2400/pytorch_model.bin\n",
1036
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-2400/preprocessor_config.json\n",
1037
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-2000] due to args.save_total_limit\n",
1038
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1039
+ "***** Running Evaluation *****\n",
1040
+ " Num examples = 306\n",
1041
+ " Batch size = 8\n",
1042
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-2800\n",
1043
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-2800/config.json\n",
1044
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-2800/pytorch_model.bin\n",
1045
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-2800/preprocessor_config.json\n",
1046
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-2400] due to args.save_total_limit\n",
1047
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1048
+ "***** Running Evaluation *****\n",
1049
+ " Num examples = 306\n",
1050
+ " Batch size = 8\n",
1051
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-3200\n",
1052
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-3200/config.json\n",
1053
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-3200/pytorch_model.bin\n",
1054
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-3200/preprocessor_config.json\n",
1055
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-2800] due to args.save_total_limit\n",
1056
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1057
+ "***** Running Evaluation *****\n",
1058
+ " Num examples = 306\n",
1059
+ " Batch size = 8\n",
1060
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-3600\n",
1061
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-3600/config.json\n",
1062
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-3600/pytorch_model.bin\n",
1063
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-3600/preprocessor_config.json\n",
1064
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-3200] due to args.save_total_limit\n",
1065
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1066
+ "***** Running Evaluation *****\n",
1067
+ " Num examples = 306\n",
1068
+ " Batch size = 8\n",
1069
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-4000\n",
1070
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-4000/config.json\n",
1071
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-4000/pytorch_model.bin\n",
1072
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-4000/preprocessor_config.json\n",
1073
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-3600] due to args.save_total_limit\n",
1074
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1075
+ "***** Running Evaluation *****\n",
1076
+ " Num examples = 306\n",
1077
+ " Batch size = 8\n",
1078
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-4400\n",
1079
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-4400/config.json\n",
1080
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-4400/pytorch_model.bin\n",
1081
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-4400/preprocessor_config.json\n",
1082
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-4000] due to args.save_total_limit\n",
1083
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1084
+ "***** Running Evaluation *****\n",
1085
+ " Num examples = 306\n",
1086
+ " Batch size = 8\n",
1087
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-4800\n",
1088
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-4800/config.json\n",
1089
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-4800/pytorch_model.bin\n",
1090
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-4800/preprocessor_config.json\n",
1091
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-4400] due to args.save_total_limit\n",
1092
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1093
+ "***** Running Evaluation *****\n",
1094
+ " Num examples = 306\n",
1095
+ " Batch size = 8\n",
1096
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-5200\n",
1097
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-5200/config.json\n",
1098
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-5200/pytorch_model.bin\n",
1099
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-5200/preprocessor_config.json\n",
1100
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-4800] due to args.save_total_limit\n",
1101
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1102
+ "***** Running Evaluation *****\n",
1103
+ " Num examples = 306\n",
1104
+ " Batch size = 8\n",
1105
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-5600\n",
1106
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-5600/config.json\n",
1107
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-5600/pytorch_model.bin\n",
1108
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-5600/preprocessor_config.json\n",
1109
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-5200] due to args.save_total_limit\n",
1110
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1111
+ "***** Running Evaluation *****\n",
1112
+ " Num examples = 306\n",
1113
+ " Batch size = 8\n",
1114
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-6000\n",
1115
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-6000/config.json\n",
1116
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-6000/pytorch_model.bin\n",
1117
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-6000/preprocessor_config.json\n",
1118
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-5600] due to args.save_total_limit\n",
1119
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1120
+ "***** Running Evaluation *****\n",
1121
+ " Num examples = 306\n",
1122
+ " Batch size = 8\n",
1123
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-6400\n",
1124
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-6400/config.json\n",
1125
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-6400/pytorch_model.bin\n",
1126
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-6400/preprocessor_config.json\n",
1127
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-6000] due to args.save_total_limit\n",
1128
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1129
+ "***** Running Evaluation *****\n",
1130
+ " Num examples = 306\n",
1131
+ " Batch size = 8\n",
1132
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-6800\n",
1133
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-6800/config.json\n",
1134
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-6800/pytorch_model.bin\n",
1135
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-6800/preprocessor_config.json\n",
1136
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-6400] due to args.save_total_limit\n",
1137
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1138
+ "***** Running Evaluation *****\n",
1139
+ " Num examples = 306\n",
1140
+ " Batch size = 8\n",
1141
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-7200\n",
1142
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-7200/config.json\n",
1143
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-7200/pytorch_model.bin\n",
1144
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-7200/preprocessor_config.json\n",
1145
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-6800] due to args.save_total_limit\n",
1146
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1147
+ "***** Running Evaluation *****\n",
1148
+ " Num examples = 306\n",
1149
+ " Batch size = 8\n",
1150
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-7600\n",
1151
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-7600/config.json\n",
1152
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-7600/pytorch_model.bin\n",
1153
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-7600/preprocessor_config.json\n",
1154
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-7200] due to args.save_total_limit\n",
1155
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1156
+ "***** Running Evaluation *****\n",
1157
+ " Num examples = 306\n",
1158
+ " Batch size = 8\n",
1159
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-8000\n",
1160
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-8000/config.json\n",
1161
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-8000/pytorch_model.bin\n",
1162
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-8000/preprocessor_config.json\n",
1163
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-7600] due to args.save_total_limit\n",
1164
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1165
+ "***** Running Evaluation *****\n",
1166
+ " Num examples = 306\n",
1167
+ " Batch size = 8\n",
1168
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-8400\n",
1169
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-8400/config.json\n",
1170
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-8400/pytorch_model.bin\n",
1171
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-8400/preprocessor_config.json\n",
1172
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-8000] due to args.save_total_limit\n",
1173
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1174
+ "***** Running Evaluation *****\n",
1175
+ " Num examples = 306\n",
1176
+ " Batch size = 8\n",
1177
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-8800\n",
1178
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-8800/config.json\n",
1179
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-8800/pytorch_model.bin\n",
1180
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-8800/preprocessor_config.json\n",
1181
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-8400] due to args.save_total_limit\n",
1182
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1183
+ "***** Running Evaluation *****\n",
1184
+ " Num examples = 306\n",
1185
+ " Batch size = 8\n",
1186
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-9200\n",
1187
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-9200/config.json\n",
1188
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-9200/pytorch_model.bin\n",
1189
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-9200/preprocessor_config.json\n",
1190
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-8800] due to args.save_total_limit\n",
1191
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1192
+ "***** Running Evaluation *****\n",
1193
+ " Num examples = 306\n",
1194
+ " Batch size = 8\n",
1195
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-9600\n",
1196
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-9600/config.json\n",
1197
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-9600/pytorch_model.bin\n",
1198
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-9600/preprocessor_config.json\n",
1199
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-9200] due to args.save_total_limit\n",
1200
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1201
+ "***** Running Evaluation *****\n",
1202
+ " Num examples = 306\n",
1203
+ " Batch size = 8\n",
1204
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-10000\n",
1205
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-10000/config.json\n",
1206
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-10000/pytorch_model.bin\n",
1207
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-10000/preprocessor_config.json\n",
1208
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-9600] due to args.save_total_limit\n",
1209
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1210
+ "***** Running Evaluation *****\n",
1211
+ " Num examples = 306\n",
1212
+ " Batch size = 8\n",
1213
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-10400\n",
1214
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-10400/config.json\n",
1215
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-10400/pytorch_model.bin\n",
1216
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-10400/preprocessor_config.json\n",
1217
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-10000] due to args.save_total_limit\n",
1218
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1219
+ "***** Running Evaluation *****\n",
1220
+ " Num examples = 306\n",
1221
+ " Batch size = 8\n",
1222
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-10800\n",
1223
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-10800/config.json\n",
1224
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-10800/pytorch_model.bin\n",
1225
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-10800/preprocessor_config.json\n",
1226
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-10400] due to args.save_total_limit\n",
1227
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1228
+ "***** Running Evaluation *****\n",
1229
+ " Num examples = 306\n",
1230
+ " Batch size = 8\n",
1231
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-11200\n",
1232
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-11200/config.json\n",
1233
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-11200/pytorch_model.bin\n",
1234
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-11200/preprocessor_config.json\n",
1235
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-10800] due to args.save_total_limit\n",
1236
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1237
+ "***** Running Evaluation *****\n",
1238
+ " Num examples = 306\n",
1239
+ " Batch size = 8\n",
1240
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-11600\n",
1241
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-11600/config.json\n",
1242
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-11600/pytorch_model.bin\n",
1243
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-11600/preprocessor_config.json\n",
1244
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-11200] due to args.save_total_limit\n",
1245
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1246
+ "***** Running Evaluation *****\n",
1247
+ " Num examples = 306\n",
1248
+ " Batch size = 8\n",
1249
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-12000\n",
1250
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-12000/config.json\n",
1251
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-12000/pytorch_model.bin\n",
1252
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-12000/preprocessor_config.json\n",
1253
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-11600] due to args.save_total_limit\n",
1254
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1255
+ "***** Running Evaluation *****\n",
1256
+ " Num examples = 306\n",
1257
+ " Batch size = 8\n",
1258
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-12400\n",
1259
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-12400/config.json\n",
1260
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-12400/pytorch_model.bin\n",
1261
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-12400/preprocessor_config.json\n",
1262
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-12000] due to args.save_total_limit\n",
1263
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1264
+ "***** Running Evaluation *****\n",
1265
+ " Num examples = 306\n",
1266
+ " Batch size = 8\n",
1267
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-12800\n",
1268
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-12800/config.json\n",
1269
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-12800/pytorch_model.bin\n",
1270
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-12800/preprocessor_config.json\n",
1271
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-12400] due to args.save_total_limit\n",
1272
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1273
+ "***** Running Evaluation *****\n",
1274
+ " Num examples = 306\n",
1275
+ " Batch size = 8\n",
1276
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-13200\n",
1277
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-13200/config.json\n",
1278
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-13200/pytorch_model.bin\n",
1279
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-13200/preprocessor_config.json\n",
1280
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-12800] due to args.save_total_limit\n",
1281
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1282
+ "***** Running Evaluation *****\n",
1283
+ " Num examples = 306\n",
1284
+ " Batch size = 8\n",
1285
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-13600\n",
1286
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-13600/config.json\n",
1287
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-13600/pytorch_model.bin\n",
1288
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-13600/preprocessor_config.json\n",
1289
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-13200] due to args.save_total_limit\n",
1290
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n",
1291
+ "***** Running Evaluation *****\n",
1292
+ " Num examples = 306\n",
1293
+ " Batch size = 8\n",
1294
+ "Saving model checkpoint to smangrul/xls-r-300m-mr/checkpoint-14000\n",
1295
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-14000/config.json\n",
1296
+ "Model weights saved in smangrul/xls-r-300m-mr/checkpoint-14000/pytorch_model.bin\n",
1297
+ "Configuration saved in smangrul/xls-r-300m-mr/checkpoint-14000/preprocessor_config.json\n",
1298
+ "Deleting older checkpoint [smangrul/xls-r-300m-mr/checkpoint-13600] due to args.save_total_limit\n",
1299
+ "\n",
1300
+ "\n",
1301
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
1302
+ "\n",
1303
+ "\n"
1304
+ ]
1305
+ },
1306
+ {
1307
+ "data": {
1308
+ "text/plain": [
1309
+ "TrainOutput(global_step=14200, training_loss=0.8374653981437146, metrics={'train_runtime': 38417.9883, 'train_samples_per_second': 11.802, 'train_steps_per_second': 0.37, 'total_flos': 9.128944889276437e+19, 'train_loss': 0.8374653981437146, 'epoch': 200.0})"
1310
+ ]
1311
+ },
1312
+ "execution_count": 18,
1313
+ "metadata": {},
1314
+ "output_type": "execute_result"
1315
+ }
1316
+ ],
1317
+ "source": [
1318
+ "trainer.train()"
1319
+ ]
1320
+ },
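+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Training ran for 200 epochs and ended with a training loss of roughly 0.84. Before pushing, it can be worth re-scoring the final weights on the held-out split once more. The cell below is a minimal sketch, assuming the `trainer` from the cell above, its `eval_dataset`, and the metric function configured earlier in this notebook (expected to report WER) are all still in scope; it was not executed as part of this run."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged sketch: re-score the final model on the evaluation split used during training.\n",
+ "# The keys of the returned dict (e.g. an eval WER entry) depend on the compute_metrics\n",
+ "# function defined earlier in the notebook.\n",
+ "eval_metrics = trainer.evaluate()\n",
+ "eval_metrics"
+ ]
+ },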
1321
+ {
1322
+ "cell_type": "code",
1323
+ "execution_count": 19,
1324
+ "metadata": {},
1325
+ "outputs": [
1326
+ {
1327
+ "name": "stderr",
1328
+ "output_type": "stream",
1329
+ "text": [
1330
+ "Saving model checkpoint to smangrul/xls-r-300m-mr\n",
1331
+ "Configuration saved in smangrul/xls-r-300m-mr/config.json\n",
1332
+ "Model weights saved in smangrul/xls-r-300m-mr/pytorch_model.bin\n",
1333
+ "Configuration saved in smangrul/xls-r-300m-mr/preprocessor_config.json\n"
1334
+ ]
1335
+ },
1336
+ {
1337
+ "data": {
1338
+ "application/vnd.jupyter.widget-view+json": {
1339
+ "model_id": "6d6ee5a61abd46f4a91acb7e34864e06",
1340
+ "version_major": 2,
1341
+ "version_minor": 0
1342
+ },
1343
+ "text/plain": [
1344
+ "Upload file pytorch_model.bin: 0%| | 3.39k/1.18G [00:00<?, ?B/s]"
1345
+ ]
1346
+ },
1347
+ "metadata": {},
1348
+ "output_type": "display_data"
1349
+ },
1350
+ {
1351
+ "name": "stderr",
1352
+ "output_type": "stream",
1353
+ "text": [
1354
+ "remote: -------------------------------------------------------------------------\u001b[31m \n",
1355
+ "remote: Your push was rejected because it contains files larger than 10M. \n",
1356
+ "remote: Please use https://git-lfs.github.com/ to store larger files.\u001b(B\u001b[m \n",
1357
+ "remote: ------------------------------------------------------------------------- \n",
1358
+ "remote: Offending files: \n",
1359
+ "remote: - language_model/unigrams.txt (ref: refs/heads/main) \n",
1360
+ "To https://huggingface.co/smangrul/xls-r-300m-mr\n",
1361
+ " ! [remote rejected] main -> main (pre-receive hook declined)\n",
1362
+ "error: failed to push some refs to 'https://user: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@huggingface.co/smangrul/xls-r-300m-mr'\n",
1363
+ "\n"
1364
+ ]
1365
+ },
1366
+ {
1367
+ "ename": "OSError",
1368
+ "evalue": "remote: -------------------------------------------------------------------------\u001b[31m \nremote: Your push was rejected because it contains files larger than 10M. \nremote: Please use https://git-lfs.github.com/ to store larger files.\u001b(B\u001b[m \nremote: ------------------------------------------------------------------------- \nremote: Offending files: \nremote: - language_model/unigrams.txt (ref: refs/heads/main) \nTo https://huggingface.co/smangrul/xls-r-300m-mr\n ! [remote rejected] main -> main (pre-receive hook declined)\nerror: failed to push some refs to 'https://user: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@huggingface.co/smangrul/xls-r-300m-mr'\n",
1369
+ "output_type": "error",
1370
+ "traceback": [
1371
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1372
+ "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)",
1373
+ "\u001b[0;32m~/hf/lib/python3.7/site-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mgit_push\u001b[0;34m(self, upstream, blocking, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1018\u001b[0m raise subprocess.CalledProcessError(\n\u001b[0;32m-> 1019\u001b[0;31m \u001b[0mreturn_code\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1020\u001b[0m )\n",
1374
+ "\u001b[0;31mCalledProcessError\u001b[0m: Command '['git', 'push', '--set-upstream', 'origin', 'main']' returned non-zero exit status 1.",
1375
+ "\nDuring handling of the above exception, another exception occurred:\n",
1376
+ "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
1377
+ "\u001b[0;32m/tmp/ipykernel_39173/1405518398.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpush_to_hub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
1378
+ "\u001b[0;32m~/transformers/src/transformers/trainer.py\u001b[0m in \u001b[0;36mpush_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m 2807\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2808\u001b[0m git_head_commit_url = self.repo.push_to_hub(\n\u001b[0;32m-> 2809\u001b[0;31m \u001b[0mcommit_message\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcommit_message\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblocking\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mblocking\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mauto_lfs_prune\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2810\u001b[0m )\n\u001b[1;32m 2811\u001b[0m \u001b[0;31m# push separately the model card to be independant from the rest of the model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1379
+ "\u001b[0;32m~/hf/lib/python3.7/site-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mpush_to_hub\u001b[0;34m(self, commit_message, blocking, clean_ok, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1252\u001b[0m \u001b[0mupstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34mf\"origin {self.current_branch}\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1253\u001b[0m \u001b[0mblocking\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mblocking\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1254\u001b[0;31m \u001b[0mauto_lfs_prune\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mauto_lfs_prune\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1255\u001b[0m )\n\u001b[1;32m 1256\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
1380
+ "\u001b[0;32m~/hf/lib/python3.7/site-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mgit_push\u001b[0;34m(self, upstream, blocking, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1021\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1022\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0msubprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mCalledProcessError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1023\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mEnvironmentError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1024\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1025\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mblocking\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1381
+ "\u001b[0;31mOSError\u001b[0m: remote: -------------------------------------------------------------------------\u001b[31m \nremote: Your push was rejected because it contains files larger than 10M. \nremote: Please use https://git-lfs.github.com/ to store larger files.\u001b(B\u001b[m \nremote: ------------------------------------------------------------------------- \nremote: Offending files: \nremote: - language_model/unigrams.txt (ref: refs/heads/main) \nTo https://huggingface.co/smangrul/xls-r-300m-mr\n ! [remote rejected] main -> main (pre-receive hook declined)\nerror: failed to push some refs to 'https://user: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@huggingface.co/smangrul/xls-r-300m-mr'\n"
1382
+ ]
1383
+ }
1384
+ ],
1385
+ "source": [
1386
+ "trainer.push_to_hub()"
1387
+ ]
1388
+ },
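+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The push above was rejected because `language_model/unigrams.txt` is larger than 10 MB and was committed without Git LFS. Simply tracking the file for future commits is not enough, since the Hub's pre-receive hook inspects the commit that already contains the plain blob. The cell below is one possible, unverified way to recover: rewrite the not-yet-pushed commit(s) so the oversized file becomes an LFS pointer, then retry the push. The local path `smangrul/xls-r-300m-mr` is taken from the checkpoint logs above; the exact behaviour of `git lfs migrate` depends on the installed git-lfs version, so treat this as a sketch rather than a verified fix."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "# Hedged sketch, not executed in this run.\n",
+ "cd smangrul/xls-r-300m-mr\n",
+ "# Rewrite the unpushed commit(s) so the >10MB file is stored via Git LFS;\n",
+ "# migrate import also records the pattern in .gitattributes of the rewritten commits.\n",
+ "git lfs migrate import --include=\"language_model/unigrams.txt\"\n",
+ "# Retry the push that was rejected above (credentials come from the clone's remote URL).\n",
+ "git push origin main"
+ ]
+ },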
1389
+ {
1390
+ "cell_type": "code",
1391
+ "execution_count": 30,
1392
+ "metadata": {},
1393
+ "outputs": [],
1394
+ "source": [
1395
+ "# train_dataset.save_to_disk(\"./Data/train_dataset\")"
1396
+ ]
1397
+ },
1398
+ {
1399
+ "cell_type": "code",
1400
+ "execution_count": 31,
1401
+ "metadata": {},
1402
+ "outputs": [],
1403
+ "source": [
1404
+ "# test_dataset.save_to_disk(\"./Data/test_dataset\")"
1405
+ ]
1406
+ },
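+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If the two commented-out `save_to_disk` calls above are run, the preprocessed splits can be restored later without redoing the audio preprocessing. The cell below is a minimal sketch assuming the same `./Data` paths; the load calls are left commented, mirroring the cells above, and only work once those directories actually exist."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged sketch: reload the splits written by the (currently commented-out) cells above.\n",
+ "from datasets import load_from_disk\n",
+ "\n",
+ "# train_dataset = load_from_disk(\"./Data/train_dataset\")\n",
+ "# test_dataset = load_from_disk(\"./Data/test_dataset\")"
+ ]
+ },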
1407
+ {
1408
+ "cell_type": "code",
1409
+ "execution_count": null,
1410
+ "metadata": {},
1411
+ "outputs": [],
1412
+ "source": []
1413
+ }
1414
+ ],
1415
+ "metadata": {
1416
+ "kernelspec": {
1417
+ "display_name": "hf",
1418
+ "language": "python",
1419
+ "name": "hf"
1420
+ },
1421
+ "language_info": {
1422
+ "codemirror_mode": {
1423
+ "name": "ipython",
1424
+ "version": 3
1425
+ },
1426
+ "file_extension": ".py",
1427
+ "mimetype": "text/x-python",
1428
+ "name": "python",
1429
+ "nbconvert_exporter": "python",
1430
+ "pygments_lexer": "ipython3",
1431
+ "version": "3.7.6"
1432
+ }
1433
+ },
1434
+ "nbformat": 4,
1435
+ "nbformat_minor": 4
1436
+ }