smangrul committed on
Commit
79c7c2b
1 Parent(s): 1abc3e4

Upload lm_ngram_decoder_training.ipynb

Browse files
Files changed (1) hide show
  1. lm_ngram_decoder_training.ipynb +648 -0
lm_ngram_decoder_training.ipynb ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 41,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from datasets import load_dataset, concatenate_datasets\n"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 70,
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stderr",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "Reusing dataset common_voice (/home/ubuntu/.cache/huggingface/datasets/mozilla-foundation___common_voice/mr/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n"
22
+ ]
23
+ },
24
+ {
25
+ "name": "stdout",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "Dataset({\n",
29
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
30
+ " num_rows: 698\n",
31
+ "})\n"
32
+ ]
33
+ },
34
+ {
35
+ "name": "stderr",
36
+ "output_type": "stream",
37
+ "text": [
38
+ "Reusing dataset open_slr (/home/ubuntu/.cache/huggingface/datasets/open_slr/SLR64/0.0.0/e0fb9e36094eff565efe812d1aba158f6a46ce834cb9705c91d1e2d6ba78ed31)\n"
39
+ ]
40
+ },
41
+ {
42
+ "name": "stdout",
43
+ "output_type": "stream",
44
+ "text": [
45
+ "Dataset({\n",
46
+ " features: ['path', 'audio', 'sentence'],\n",
47
+ " num_rows: 1569\n",
48
+ "})\n"
49
+ ]
50
+ },
51
+ {
52
+ "name": "stderr",
53
+ "output_type": "stream",
54
+ "text": [
55
+ "Using custom data configuration shivam--marathi_samanantar_processed-538aa7995793bd87\n",
56
+ "Reusing dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/shivam--marathi_samanantar_processed-538aa7995793bd87/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)\n"
57
+ ]
58
+ },
59
+ {
60
+ "name": "stdout",
61
+ "output_type": "stream",
62
+ "text": [
63
+ "Dataset({\n",
64
+ " features: ['text'],\n",
65
+ " num_rows: 3047226\n",
66
+ "})\n"
67
+ ]
68
+ },
69
+ {
70
+ "name": "stderr",
71
+ "output_type": "stream",
72
+ "text": [
73
+ "Using custom data configuration shivam--marathi_pib_processed-2348554e5319bdfe\n",
74
+ "Reusing dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/shivam--marathi_pib_processed-2348554e5319bdfe/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)\n"
75
+ ]
76
+ },
77
+ {
78
+ "name": "stdout",
79
+ "output_type": "stream",
80
+ "text": [
81
+ "Dataset({\n",
82
+ " features: ['text'],\n",
83
+ " num_rows: 117199\n",
84
+ "})\n"
85
+ ]
86
+ },
87
+ {
88
+ "name": "stderr",
89
+ "output_type": "stream",
90
+ "text": [
91
+ "Reusing dataset opus100 (/home/ubuntu/.cache/huggingface/datasets/opus100/en-mr/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)\n",
92
+ "Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/opus100/en-mr/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704/cache-201d21d7acc2864f.arrow\n"
93
+ ]
94
+ },
95
+ {
96
+ "name": "stdout",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "Dataset({\n",
100
+ " features: ['translation', 'sentence'],\n",
101
+ " num_rows: 27007\n",
102
+ "})\n"
103
+ ]
104
+ },
105
+ {
106
+ "name": "stderr",
107
+ "output_type": "stream",
108
+ "text": [
109
+ "Reusing dataset tatoeba (/home/ubuntu/.cache/huggingface/datasets/tatoeba/en-mr/2021.7.22/b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)\n"
110
+ ]
111
+ },
112
+ {
113
+ "data": {
114
+ "application/vnd.jupyter.widget-view+json": {
115
+ "model_id": "c0dba507cea344768aa20cd7c5593a0c",
116
+ "version_major": 2,
117
+ "version_minor": 0
118
+ },
119
+ "text/plain": [
120
+ " 0%| | 0/53462 [00:00<?, ?ex/s]"
121
+ ]
122
+ },
123
+ "metadata": {},
124
+ "output_type": "display_data"
125
+ },
126
+ {
127
+ "name": "stdout",
128
+ "output_type": "stream",
129
+ "text": [
130
+ "Dataset({\n",
131
+ " features: ['id', 'translation', 'sentence'],\n",
132
+ " num_rows: 53462\n",
133
+ "})\n"
134
+ ]
135
+ },
136
+ {
137
+ "name": "stderr",
138
+ "output_type": "stream",
139
+ "text": [
140
+ "Reusing dataset tapaco (/home/ubuntu/.cache/huggingface/datasets/tapaco/mr/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698)\n"
141
+ ]
142
+ },
143
+ {
144
+ "name": "stdout",
145
+ "output_type": "stream",
146
+ "text": [
147
+ "Dataset({\n",
148
+ " features: ['paraphrase_set_id', 'sentence_id', 'paraphrase', 'lists', 'tags', 'language'],\n",
149
+ " num_rows: 16413\n",
150
+ "})\n"
151
+ ]
152
+ }
153
+ ],
154
+ "source": [
155
+ "cv = load_dataset(\"mozilla-foundation/common_voice_8_0\", \"mr\", split=\"train+validation\", use_auth_token=True)\n",
156
+ "print(cv)\n",
157
+ "openslr = load_dataset(\"openslr\", \"SLR64\", split=\"train\")\n",
158
+ "print(openslr)\n",
159
+ "samanantar = load_dataset(\"shivam/marathi_samanantar_processed\", split=\"train\")\n",
160
+ "print(samanantar)\n",
161
+ "pib = load_dataset(\"shivam/marathi_pib_processed\", split=\"train\")\n",
162
+ "print(pib)\n",
163
+ "opus = load_dataset(\"opus100\", \"en-mr\", split=\"train\").map(lambda example: {\"sentence\": example[\"translation\"][\"mr\"]})\n",
164
+ "print(opus)\n",
165
+ "tatoeba = load_dataset(\"tatoeba\", \"en-mr\", split=\"train\").map(lambda example: {\"sentence\": example[\"translation\"][\"mr\"]})\n",
166
+ "print(tatoeba)\n",
167
+ "tapaco = load_dataset(\"tapaco\", \"mr\", split=\"train\")\n",
168
+ "print(tapaco)\n",
169
+ "\n"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": 71,
175
+ "metadata": {},
176
+ "outputs": [
177
+ {
178
+ "data": {
179
+ "text/plain": [
180
+ "Dataset({\n",
181
+ " features: ['sentence'],\n",
182
+ " num_rows: 3263574\n",
183
+ "})"
184
+ ]
185
+ },
186
+ "execution_count": 71,
187
+ "metadata": {},
188
+ "output_type": "execute_result"
189
+ }
190
+ ],
191
+ "source": [
192
+ "cv = cv.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\", 'path', 'audio'])\n",
193
+ "openslr = openslr.remove_columns(['path', 'audio'])\n",
194
+ "samanantar = samanantar.rename_column(\"text\",\"sentence\")\n",
195
+ "pib = pib.rename_column(\"text\",\"sentence\")\n",
196
+ "opus = opus.remove_columns([\"translation\"])\n",
197
+ "tatoeba = tatoeba.remove_columns(['id','translation'])\n",
198
+ "tapaco = tapaco.remove_columns(['paraphrase_set_id', 'sentence_id', 'lists', 'tags', 'language']).rename_column(\"paraphrase\",\"sentence\")\n",
199
+ "\n",
200
+ "text_dataset = concatenate_datasets([cv, openslr, samanantar, pib, opus, tatoeba, tapaco])\n",
201
+ "text_dataset\n"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": 73,
207
+ "metadata": {},
208
+ "outputs": [],
209
+ "source": [
210
+ "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–\\।\\!\\\"\\,\\-\\.\\?\\:\\|\\“\\”\\–\\;\\'\\’\\‘\\॔]' # change to the ignored characters of your fine-tuned model"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": 74,
216
+ "metadata": {},
217
+ "outputs": [],
218
+ "source": [
219
+ "import re\n",
220
+ "\n",
221
+ "def extract_text(batch):\n",
222
+ " text = batch[\"sentence\"]\n",
223
+ " batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", text.lower())\n",
224
+ " return batch"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 76,
230
+ "metadata": {},
231
+ "outputs": [
232
+ {
233
+ "data": {
234
+ "application/vnd.jupyter.widget-view+json": {
235
+ "model_id": "4334d72e02f140bf9078cb97c5353d70",
236
+ "version_major": 2,
237
+ "version_minor": 0
238
+ },
239
+ "text/plain": [
240
+ " 0%| | 0/3263574 [00:00<?, ?ex/s]"
241
+ ]
242
+ },
243
+ "metadata": {},
244
+ "output_type": "display_data"
245
+ },
246
+ {
247
+ "data": {
248
+ "text/plain": [
249
+ "Dataset({\n",
250
+ " features: ['text'],\n",
251
+ " num_rows: 3263574\n",
252
+ "})"
253
+ ]
254
+ },
255
+ "execution_count": 76,
256
+ "metadata": {},
257
+ "output_type": "execute_result"
258
+ }
259
+ ],
260
+ "source": [
261
+ "dataset = text_dataset.map(extract_text, remove_columns=text_dataset.column_names)\n",
262
+ "dataset"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 77,
268
+ "metadata": {},
269
+ "outputs": [
270
+ {
271
+ "data": {
272
+ "text/plain": [
273
+ "{'text': 'शिवाय त्यांना कवितेचा आणि चित्रकलेचा छंद होता'}"
274
+ ]
275
+ },
276
+ "execution_count": 77,
277
+ "metadata": {},
278
+ "output_type": "execute_result"
279
+ }
280
+ ],
281
+ "source": [
282
+ "dataset[0]"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 78,
288
+ "metadata": {},
289
+ "outputs": [],
290
+ "source": [
291
+ "with open(\"text.txt\", \"w\") as file:\n",
292
+ " file.write(\" \".join(dataset[\"text\"]))"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": 82,
298
+ "metadata": {},
299
+ "outputs": [
300
+ {
301
+ "name": "stdout",
302
+ "output_type": "stream",
303
+ "text": [
304
+ "=== 1/5 Counting and sorting n-grams ===\n",
305
+ "Reading /ebs/learn/ASR/text.txt\n",
306
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
307
+ "****************************************************************************************************\n",
308
+ "Unigram tokens 29706056 types 776336\n",
309
+ "=== 2/5 Calculating and sorting adjusted counts ===\n",
310
+ "Chain sizes: 1:9316032 2:20102516736 3:37692219392 4:60307550208 5:87948517376\n",
311
+ "Statistics:\n",
312
+ "1 776335 D1=0.705463 D2=1.0456 D3+=1.33671\n",
313
+ "2 8433103 D1=0.790673 D2=1.11187 D3+=1.35296\n",
314
+ "3 18421039 D1=0.878727 D2=1.22916 D3+=1.39519\n",
315
+ "4 24029132 D1=0.935948 D2=1.36969 D3+=1.49375\n",
316
+ "5 26433229 D1=0.885046 D2=1.58244 D3+=2.0281\n",
317
+ "Memory estimate for binary LM:\n",
318
+ "type MB\n",
319
+ "probing 1637 assuming -p 1.5\n",
320
+ "probing 1931 assuming -r models -p 1.5\n",
321
+ "trie 833 without quantization\n",
322
+ "trie 476 assuming -q 8 -b 8 quantization \n",
323
+ "trie 726 assuming -a 22 array pointer compression\n",
324
+ "trie 368 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
325
+ "=== 3/5 Calculating and sorting initial probabilities ===\n",
326
+ "Chain sizes: 1:9316020 2:134929648 3:368420780 4:576699168 5:740130412\n",
327
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
328
+ "####################################################################################################\n",
329
+ "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
330
+ "Chain sizes: 1:9316020 2:134929648 3:368420780 4:576699168 5:740130412\n",
331
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
332
+ "####################################################################################################\n",
333
+ "=== 5/5 Writing ARPA model ===\n",
334
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
335
+ "****************************************************************************************************\n",
336
+ "Name:lmplz\tVmPeak:201429316 kB\tVmRSS:29888 kB\tRSSMax:36259508 kB\tuser:86.1274\tsys:40.4955\tCPU:126.623\treal:99.6214\n"
337
+ ]
338
+ }
339
+ ],
340
+ "source": [
341
+ "!kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 83,
347
+ "metadata": {},
348
+ "outputs": [
349
+ {
350
+ "name": "stdout",
351
+ "output_type": "stream",
352
+ "text": [
353
+ "\\data\\\r\n",
354
+ "ngram 1=776335\r\n",
355
+ "ngram 2=8433103\r\n",
356
+ "ngram 3=18421039\r\n",
357
+ "ngram 4=24029132\r\n",
358
+ "ngram 5=26433229\r\n",
359
+ "\r\n",
360
+ "\\1-grams:\r\n",
361
+ "-6.9649706\t<unk>\t0\r\n",
362
+ "0\t<s>\t-0.10200334\r\n",
363
+ "-3.8677218\tशिवाय\t-0.29601222\r\n",
364
+ "-3.0139472\tत्यांना\t-0.54708624\r\n",
365
+ "-5.7931695\tकवितेचा\t-0.10200334\r\n",
366
+ "-2.2375891\tआणि\t-0.5685015\r\n",
367
+ "-6.046465\tचित्रकलेचा\t-0.16192785\r\n",
368
+ "-4.874536\tछंद\t-0.3758324\r\n",
369
+ "-3.150044\tहोता\t-0.53179973\r\n",
370
+ "-6.514799\tपारंपरिकदृष्ट्या\t-0.10200334\r\n",
371
+ "-4.837577\tज्वारी\t-0.3880814\r\n",
372
+ "-4.9689674\tबाजरी\t-0.32780117\r\n"
373
+ ]
374
+ }
375
+ ],
376
+ "source": [
377
+ "!head -20 5gram.arpa"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": 85,
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": [
386
+ "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n",
387
+ " has_added_eos = False\n",
388
+ " for line in read_file:\n",
389
+ " if not has_added_eos and \"ngram 1=\" in line:\n",
390
+ " count=line.strip().split(\"=\")[-1]\n",
391
+ " write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
392
+ " elif not has_added_eos and \"<s>\" in line:\n",
393
+ " write_file.write(line)\n",
394
+ " write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
395
+ " has_added_eos = True\n",
396
+ " else:\n",
397
+ " write_file.write(line)"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": 86,
403
+ "metadata": {},
404
+ "outputs": [
405
+ {
406
+ "name": "stdout",
407
+ "output_type": "stream",
408
+ "text": [
409
+ "\\data\\\r\n",
410
+ "ngram 1=776336\r\n",
411
+ "ngram 2=8433103\r\n",
412
+ "ngram 3=18421039\r\n",
413
+ "ngram 4=24029132\r\n",
414
+ "ngram 5=26433229\r\n",
415
+ "\r\n",
416
+ "\\1-grams:\r\n",
417
+ "-6.9649706\t<unk>\t0\r\n",
418
+ "0\t<s>\t-0.10200334\r\n",
419
+ "0\t</s>\t-0.10200334\r\n",
420
+ "-3.8677218\tशिवाय\t-0.29601222\r\n",
421
+ "-3.0139472\tत्यांना\t-0.54708624\r\n",
422
+ "-5.7931695\tकवितेचा\t-0.10200334\r\n",
423
+ "-2.2375891\tआणि\t-0.5685015\r\n",
424
+ "-6.046465\tचित्रकलेचा\t-0.16192785\r\n",
425
+ "-4.874536\tछंद\t-0.3758324\r\n",
426
+ "-3.150044\tहोता\t-0.53179973\r\n",
427
+ "-6.514799\tपारंपरिकदृष्ट्या\t-0.10200334\r\n",
428
+ "-4.837577\tज्वारी\t-0.3880814\r\n"
429
+ ]
430
+ }
431
+ ],
432
+ "source": [
433
+ "!head -20 5gram_correct.arpa"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": 87,
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "from transformers import AutoProcessor\n",
443
+ "\n",
444
+ "processor = AutoProcessor.from_pretrained(\"smangrul/xls-r-300m-mr\")"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": 88,
450
+ "metadata": {},
451
+ "outputs": [
452
+ {
453
+ "data": {
454
+ "text/plain": [
455
+ "{'|': 0,\n",
456
+ " 'ँ': 1,\n",
457
+ " 'ं': 2,\n",
458
+ " 'ः': 3,\n",
459
+ " 'अ': 4,\n",
460
+ " 'आ': 5,\n",
461
+ " 'इ': 6,\n",
462
+ " 'ई': 7,\n",
463
+ " 'उ': 8,\n",
464
+ " 'ऊ': 9,\n",
465
+ " 'ऋ': 10,\n",
466
+ " 'ए': 11,\n",
467
+ " 'ऐ': 12,\n",
468
+ " 'ऑ': 13,\n",
469
+ " 'ओ': 14,\n",
470
+ " 'औ': 15,\n",
471
+ " 'क': 16,\n",
472
+ " 'ख': 17,\n",
473
+ " 'ग': 18,\n",
474
+ " 'घ': 19,\n",
475
+ " 'च': 20,\n",
476
+ " 'छ': 21,\n",
477
+ " 'ज': 22,\n",
478
+ " 'झ': 23,\n",
479
+ " 'ञ': 24,\n",
480
+ " 'ट': 25,\n",
481
+ " 'ठ': 26,\n",
482
+ " 'ड': 27,\n",
483
+ " 'ढ': 28,\n",
484
+ " 'ण': 29,\n",
485
+ " 'त': 30,\n",
486
+ " 'थ': 31,\n",
487
+ " 'द': 32,\n",
488
+ " 'ध': 33,\n",
489
+ " 'न': 34,\n",
490
+ " 'प': 35,\n",
491
+ " 'फ': 36,\n",
492
+ " 'ब': 37,\n",
493
+ " 'भ': 38,\n",
494
+ " 'म': 39,\n",
495
+ " 'य': 40,\n",
496
+ " 'र': 41,\n",
497
+ " 'ऱ': 42,\n",
498
+ " 'ल': 43,\n",
499
+ " 'ळ': 44,\n",
500
+ " 'व': 45,\n",
501
+ " 'श': 46,\n",
502
+ " 'ष': 47,\n",
503
+ " 'स': 48,\n",
504
+ " 'ह': 49,\n",
505
+ " '़': 50,\n",
506
+ " 'ा': 51,\n",
507
+ " 'ि': 52,\n",
508
+ " 'ी': 53,\n",
509
+ " 'ु': 54,\n",
510
+ " 'ू': 55,\n",
511
+ " 'ृ': 56,\n",
512
+ " 'ॄ': 57,\n",
513
+ " 'ॅ': 58,\n",
514
+ " 'े': 59,\n",
515
+ " 'ै': 60,\n",
516
+ " 'ॉ': 61,\n",
517
+ " 'ॊ': 62,\n",
518
+ " 'ो': 63,\n",
519
+ " 'ौ': 64,\n",
520
+ " '्': 65,\n",
521
+ " 'ॲ': 66,\n",
522
+ " '[unk]': 67,\n",
523
+ " '[pad]': 68,\n",
524
+ " '<s>': 69,\n",
525
+ " '</s>': 70}"
526
+ ]
527
+ },
528
+ "execution_count": 88,
529
+ "metadata": {},
530
+ "output_type": "execute_result"
531
+ }
532
+ ],
533
+ "source": [
534
+ "vocab_dict = processor.tokenizer.get_vocab()\n",
535
+ "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}\n",
536
+ "sorted_vocab_dict\n"
537
+ ]
538
+ },
539
+ {
540
+ "cell_type": "code",
541
+ "execution_count": 89,
542
+ "metadata": {},
543
+ "outputs": [
544
+ {
545
+ "name": "stderr",
546
+ "output_type": "stream",
547
+ "text": [
548
+ "Loading the LM will be faster if you build a binary file.\n",
549
+ "Reading /ebs/learn/ASR/5gram_correct.arpa\n",
550
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
551
+ "****************************************************************************************************\n",
552
+ "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
553
+ "Unigrams and labels don't seem to agree.\n"
554
+ ]
555
+ }
556
+ ],
557
+ "source": [
558
+ "from pyctcdecode import build_ctcdecoder\n",
559
+ "\n",
560
+ "decoder = build_ctcdecoder(\n",
561
+ " labels=list(sorted_vocab_dict.keys()),\n",
562
+ " kenlm_model_path=\"5gram_correct.arpa\",\n",
563
+ ")"
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "code",
568
+ "execution_count": 90,
569
+ "metadata": {},
570
+ "outputs": [
571
+ {
572
+ "data": {
573
+ "text/plain": [
574
+ "<pyctcdecode.decoder.BeamSearchDecoderCTC at 0x7fe8a63c65d0>"
575
+ ]
576
+ },
577
+ "execution_count": 90,
578
+ "metadata": {},
579
+ "output_type": "execute_result"
580
+ }
581
+ ],
582
+ "source": [
583
+ "decoder"
584
+ ]
585
+ },
586
+ {
587
+ "cell_type": "code",
588
+ "execution_count": 91,
589
+ "metadata": {},
590
+ "outputs": [],
591
+ "source": [
592
+ "from transformers import Wav2Vec2ProcessorWithLM\n",
593
+ "\n",
594
+ "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
595
+ " feature_extractor=processor.feature_extractor,\n",
596
+ " tokenizer=processor.tokenizer,\n",
597
+ " decoder=decoder\n",
598
+ ")"
599
+ ]
600
+ },
601
+ {
602
+ "cell_type": "code",
603
+ "execution_count": 92,
604
+ "metadata": {},
605
+ "outputs": [],
606
+ "source": [
607
+ "processor_with_lm.save_pretrained(\"./smangrul/xls-r-300m-mr/\")"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": 95,
613
+ "metadata": {},
614
+ "outputs": [],
615
+ "source": [
616
+ "processor_with_lm.save_pretrained(\"./../xls-r-300m-mr-model/\")"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": null,
622
+ "metadata": {},
623
+ "outputs": [],
624
+ "source": []
625
+ }
626
+ ],
627
+ "metadata": {
628
+ "kernelspec": {
629
+ "display_name": "hf",
630
+ "language": "python",
631
+ "name": "hf"
632
+ },
633
+ "language_info": {
634
+ "codemirror_mode": {
635
+ "name": "ipython",
636
+ "version": 3
637
+ },
638
+ "file_extension": ".py",
639
+ "mimetype": "text/x-python",
640
+ "name": "python",
641
+ "nbconvert_exporter": "python",
642
+ "pygments_lexer": "ipython3",
643
+ "version": "3.7.6"
644
+ }
645
+ },
646
+ "nbformat": 4,
647
+ "nbformat_minor": 4
648
+ }