chmanoj committed on
Commit
c82a77a
1 Parent(s): 2ea8870

Update src files

Browse files
.gitattributes CHANGED
@@ -1,6 +1,6 @@
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
- text.txt filter=lfs diff=lfs merge=lfs -text
4
  *.bin filter=lfs diff=lfs merge=lfs -text
5
  *.bin.* filter=lfs diff=lfs merge=lfs -text
6
  *.bz2 filter=lfs diff=lfs merge=lfs -text
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *kenlm_text_te.txt filter=lfs diff=lfs merge=lfs -text
4
  *.bin filter=lfs diff=lfs merge=lfs -text
5
  *.bin.* filter=lfs diff=lfs merge=lfs -text
6
  *.bz2 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .ipynb_checkpoints/
src/Create_LM.ipynb CHANGED
@@ -2,8 +2,8 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 2,
6
- "id": "5e445ce4-1507-482d-a2a8-03d8802e6311",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
@@ -12,14 +12,14 @@
12
  },
13
  {
14
  "cell_type": "code",
15
- "execution_count": 3,
16
- "id": "1c1820bc-0125-4589-983f-e454801435a5",
17
  "metadata": {},
18
  "outputs": [
19
  {
20
  "data": {
21
  "application/vnd.jupyter.widget-view+json": {
22
- "model_id": "117e880c8ae8437e9a16ccdf20b659eb",
23
  "version_major": 2,
24
  "version_minor": 0
25
  },
@@ -34,20 +34,20 @@
34
  "name": "stderr",
35
  "output_type": "stream",
36
  "text": [
37
- "Using custom data configuration chmanoj--ai4bharat__samanantar_processed_te-a0473fa2e2573d48\n"
38
  ]
39
  },
40
  {
41
  "name": "stdout",
42
  "output_type": "stream",
43
  "text": [
44
- "Downloading and preparing dataset samanantar/te (download: 292.93 MiB, generated: 678.62 MiB, post-processed: Unknown size, total: 971.55 MiB) to /home/manoj/.cache/huggingface/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-a0473fa2e2573d48/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
45
  ]
46
  },
47
  {
48
  "data": {
49
  "application/vnd.jupyter.widget-view+json": {
50
- "model_id": "bd1bfffa9a424a45b3b7324458818f4a",
51
  "version_major": 2,
52
  "version_minor": 0
53
  },
@@ -61,7 +61,7 @@
61
  {
62
  "data": {
63
  "application/vnd.jupyter.widget-view+json": {
64
- "model_id": "22a24004a7a546ea88bf7c3fe1c16e46",
65
  "version_major": 2,
66
  "version_minor": 0
67
  },
@@ -75,7 +75,7 @@
75
  {
76
  "data": {
77
  "application/vnd.jupyter.widget-view+json": {
78
- "model_id": "9e4a161541734dfbb2de2d3dd46d8753",
79
  "version_major": 2,
80
  "version_minor": 0
81
  },
@@ -89,7 +89,7 @@
89
  {
90
  "data": {
91
  "application/vnd.jupyter.widget-view+json": {
92
- "model_id": "992db97134c94b9284b421c7f3ea0b33",
93
  "version_major": 2,
94
  "version_minor": 0
95
  },
@@ -104,7 +104,7 @@
104
  "name": "stdout",
105
  "output_type": "stream",
106
  "text": [
107
- "Dataset parquet downloaded and prepared to /home/manoj/.cache/huggingface/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-a0473fa2e2573d48/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
108
  ]
109
  }
110
  ],
@@ -115,18 +115,18 @@
115
  {
116
  "cell_type": "code",
117
  "execution_count": 4,
118
- "id": "62fb01f7-24fe-4384-9940-3c262c321a5d",
119
  "metadata": {},
120
  "outputs": [],
121
  "source": [
122
- "with open(\"text.txt\", \"w\") as file:\n",
123
  " file.write(\" \".join(dataset[\"text\"]))"
124
  ]
125
  },
126
  {
127
  "cell_type": "code",
128
  "execution_count": null,
129
- "id": "4295ab4b-b4d8-4a39-a896-fb86503e4674",
130
  "metadata": {},
131
  "outputs": [],
132
  "source": []
@@ -134,13 +134,13 @@
134
  {
135
  "cell_type": "code",
136
  "execution_count": 5,
137
- "id": "fcc0b573-516a-45d6-af2a-feace521c16d",
138
  "metadata": {},
139
  "outputs": [
140
  {
141
  "data": {
142
  "text/plain": [
143
- "'/mnt/c/Projects/Speech/xls-R-finetuning/lm_te'"
144
  ]
145
  },
146
  "execution_count": 5,
@@ -155,8 +155,8 @@
155
  },
156
  {
157
  "cell_type": "code",
158
- "execution_count": 6,
159
- "id": "e1f8f887-6201-4ae0-989e-8bdc57816db1",
160
  "metadata": {},
161
  "outputs": [
162
  {
@@ -164,12 +164,12 @@
164
  "output_type": "stream",
165
  "text": [
166
  "=== 1/5 Counting and sorting n-grams ===\n",
167
- "Reading /mnt/c/Projects/Speech/xls-R-finetuning/lm_te/text.txt\n",
168
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
169
  "****************************************************************************************************\n",
170
  "Unigram tokens 32852369 types 1308846\n",
171
  "=== 2/5 Calculating and sorting adjusted counts ===\n",
172
- "Chain sizes: 1:15706152 2:2291295744 3:4296179712\n",
173
  "Statistics:\n",
174
  "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
175
  "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
@@ -193,18 +193,18 @@
193
  "=== 5/5 Writing ARPA model ===\n",
194
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
195
  "****************************************************************************************************\n",
196
- "Name:lmplz\tVmPeak:6613460 kB\tVmRSS:37976 kB\tRSSMax:1975488 kB\tuser:33.1964\tsys:9.29228\tCPU:42.4891\treal:65.5831\n"
197
  ]
198
  }
199
  ],
200
  "source": [
201
- "!../kenlm/build/bin/lmplz -o 3 <\"text.txt\" > \"3gram.arpa\""
202
  ]
203
  },
204
  {
205
  "cell_type": "code",
206
- "execution_count": null,
207
- "id": "afee7f94-f247-4891-822e-1f4edd5abc81",
208
  "metadata": {},
209
  "outputs": [
210
  {
@@ -212,12 +212,12 @@
212
  "output_type": "stream",
213
  "text": [
214
  "=== 1/5 Counting and sorting n-grams ===\n",
215
- "Reading /mnt/c/Projects/Speech/xls-R-finetuning/lm_te/text.txt\n",
216
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
217
  "****************************************************************************************************\n",
218
  "Unigram tokens 32852369 types 1308846\n",
219
  "=== 2/5 Calculating and sorting adjusted counts ===\n",
220
- "Chain sizes: 1:15706152 2:642680448 3:1205025920 4:1928041344 5:2811727104\n",
221
  "Statistics:\n",
222
  "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
223
  "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
@@ -243,40 +243,40 @@
243
  "=== 5/5 Writing ARPA model ===\n",
244
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
245
  "****************************************************************************************************\n",
246
- "Name:lmplz\tVmPeak:6620664 kB\tVmRSS:38084 kB\tRSSMax:2239444 kB\tuser:77.3579\tsys:28.8403\tCPU:106.198\treal:159.405\n"
247
  ]
248
  }
249
  ],
250
  "source": [
251
- "!../kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
252
  ]
253
  },
254
  {
255
  "cell_type": "code",
256
  "execution_count": null,
257
- "id": "4d4f8526-fb6a-40cc-bf02-75c78b4138cd",
258
  "metadata": {},
259
  "outputs": [],
260
  "source": []
261
  },
262
  {
263
  "cell_type": "code",
264
- "execution_count": 9,
265
- "id": "33e3c247-1b4b-4e61-a42e-283bef351c4b",
266
  "metadata": {},
267
  "outputs": [
268
  {
269
  "name": "stdout",
270
  "output_type": "stream",
271
  "text": [
272
- "CPU times: user 22.7 s, sys: 6.28 s, total: 28.9 s\n",
273
- "Wall time: 1min 29s\n"
274
  ]
275
  }
276
  ],
277
  "source": [
278
  "%%time\n",
279
- "with open(\"3gram.arpa\", \"r\") as read_file, open(\"3gram_correct.arpa\", \"w\") as write_file:\n",
280
  " has_added_eos = False\n",
281
  " for line in read_file:\n",
282
  " if not has_added_eos and \"ngram 1=\" in line:\n",
@@ -292,22 +292,22 @@
292
  },
293
  {
294
  "cell_type": "code",
295
- "execution_count": 10,
296
- "id": "0f8ead29-e478-48dd-ace5-46d787d3d68e",
297
  "metadata": {},
298
  "outputs": [
299
  {
300
  "name": "stdout",
301
  "output_type": "stream",
302
  "text": [
303
- "CPU times: user 1min 25s, sys: 27.2 s, total: 1min 52s\n",
304
- "Wall time: 5min 28s\n"
305
  ]
306
  }
307
  ],
308
  "source": [
309
  "%%time\n",
310
- "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n",
311
  " has_added_eos = False\n",
312
  " for line in read_file:\n",
313
  " if not has_added_eos and \"ngram 1=\" in line:\n",
@@ -324,7 +324,7 @@
324
  {
325
  "cell_type": "code",
326
  "execution_count": null,
327
- "id": "ad4ea204-d61c-4316-bc30-5bbda696d225",
328
  "metadata": {},
329
  "outputs": [],
330
  "source": []
@@ -332,7 +332,7 @@
332
  {
333
  "cell_type": "code",
334
  "execution_count": null,
335
- "id": "152fecfe-9a51-4f6d-9640-c810adb5e456",
336
  "metadata": {},
337
  "outputs": [],
338
  "source": []
@@ -340,7 +340,7 @@
340
  ],
341
  "metadata": {
342
  "kernelspec": {
343
- "display_name": "Python 3 (ipykernel)",
344
  "language": "python",
345
  "name": "python3"
346
  },
@@ -354,7 +354,7 @@
354
  "name": "python",
355
  "nbconvert_exporter": "python",
356
  "pygments_lexer": "ipython3",
357
- "version": "3.7.10"
358
  }
359
  },
360
  "nbformat": 4,
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "451d890e",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
 
12
  },
13
  {
14
  "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "eb0e4037",
17
  "metadata": {},
18
  "outputs": [
19
  {
20
  "data": {
21
  "application/vnd.jupyter.widget-view+json": {
22
+ "model_id": "3451cb7648e349cbbbdea3b672207ef7",
23
  "version_major": 2,
24
  "version_minor": 0
25
  },
 
34
  "name": "stderr",
35
  "output_type": "stream",
36
  "text": [
37
+ "Using custom data configuration chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035\n"
38
  ]
39
  },
40
  {
41
  "name": "stdout",
42
  "output_type": "stream",
43
  "text": [
44
+ "Downloading and preparing dataset samanantar/te (download: 292.93 MiB, generated: 678.62 MiB, post-processed: Unknown size, total: 971.55 MiB) to /workspace/cache/hf/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
45
  ]
46
  },
47
  {
48
  "data": {
49
  "application/vnd.jupyter.widget-view+json": {
50
+ "model_id": "68ea006ea9b943c3af2ed5ee7bb9fffb",
51
  "version_major": 2,
52
  "version_minor": 0
53
  },
 
61
  {
62
  "data": {
63
  "application/vnd.jupyter.widget-view+json": {
64
+ "model_id": "b5276db8e4614107ad0bdfe67ccca2fd",
65
  "version_major": 2,
66
  "version_minor": 0
67
  },
 
75
  {
76
  "data": {
77
  "application/vnd.jupyter.widget-view+json": {
78
+ "model_id": "0d3e27b107e7401dbe7f5dad8aa7ec08",
79
  "version_major": 2,
80
  "version_minor": 0
81
  },
 
89
  {
90
  "data": {
91
  "application/vnd.jupyter.widget-view+json": {
92
+ "model_id": "ead9e8fde9a842b295955332ecae540d",
93
  "version_major": 2,
94
  "version_minor": 0
95
  },
 
104
  "name": "stdout",
105
  "output_type": "stream",
106
  "text": [
107
+ "Dataset parquet downloaded and prepared to /workspace/cache/hf/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
108
  ]
109
  }
110
  ],
 
115
  {
116
  "cell_type": "code",
117
  "execution_count": 4,
118
+ "id": "e4f4f4e8",
119
  "metadata": {},
120
  "outputs": [],
121
  "source": [
122
+ "with open(\"kenlm_text_te.txt\", \"w\") as file:\n",
123
  " file.write(\" \".join(dataset[\"text\"]))"
124
  ]
125
  },
126
  {
127
  "cell_type": "code",
128
  "execution_count": null,
129
+ "id": "6e8a0e84",
130
  "metadata": {},
131
  "outputs": [],
132
  "source": []
 
134
  {
135
  "cell_type": "code",
136
  "execution_count": 5,
137
+ "id": "5dfbf3e1",
138
  "metadata": {},
139
  "outputs": [
140
  {
141
  "data": {
142
  "text/plain": [
143
+ "'/workspace/kenlm_te/src'"
144
  ]
145
  },
146
  "execution_count": 5,
 
155
  },
156
  {
157
  "cell_type": "code",
158
+ "execution_count": 8,
159
+ "id": "494bec1a",
160
  "metadata": {},
161
  "outputs": [
162
  {
 
164
  "output_type": "stream",
165
  "text": [
166
  "=== 1/5 Counting and sorting n-grams ===\n",
167
+ "Reading /workspace/kenlm_te/src/kenlm_text_te.txt\n",
168
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
169
  "****************************************************************************************************\n",
170
  "Unigram tokens 32852369 types 1308846\n",
171
  "=== 2/5 Calculating and sorting adjusted counts ===\n",
172
+ "Chain sizes: 1:15706152 2:51606089728 3:96761421824\n",
173
  "Statistics:\n",
174
  "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
175
  "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
 
193
  "=== 5/5 Writing ARPA model ===\n",
194
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
195
  "****************************************************************************************************\n",
196
+ "Name:lmplz\tVmPeak:145080616 kB\tVmRSS:38292 kB\tRSSMax:33928732 kB\tuser:43.6485\tsys:27.5682\tCPU:71.2168\treal:64.983\n"
197
  ]
198
  }
199
  ],
200
  "source": [
201
+ "!../../kenlm/build/bin/lmplz -o 3 <\"kenlm_text_te.txt\" > \"../3gram.arpa\""
202
  ]
203
  },
204
  {
205
  "cell_type": "code",
206
+ "execution_count": 9,
207
+ "id": "c2c8c8ce",
208
  "metadata": {},
209
  "outputs": [
210
  {
 
212
  "output_type": "stream",
213
  "text": [
214
  "=== 1/5 Counting and sorting n-grams ===\n",
215
+ "Reading /workspace/kenlm_te/src/kenlm_text_te.txt\n",
216
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
217
  "****************************************************************************************************\n",
218
  "Unigram tokens 32852369 types 1308846\n",
219
  "=== 2/5 Calculating and sorting adjusted counts ===\n",
220
+ "Chain sizes: 1:15706152 2:14474877952 3:27140399104 4:43424632832 5:63327596544\n",
221
  "Statistics:\n",
222
  "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
223
  "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
 
243
  "=== 5/5 Writing ARPA model ===\n",
244
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
245
  "****************************************************************************************************\n",
246
+ "Name:lmplz\tVmPeak:145104204 kB\tVmRSS:38296 kB\tRSSMax:26419104 kB\tuser:89.0779\tsys:42.0565\tCPU:131.134\treal:97.4678\n"
247
  ]
248
  }
249
  ],
250
  "source": [
251
+ "!../../kenlm/build/bin/lmplz -o 5 <\"kenlm_text_te.txt\" > \"../5gram.arpa\""
252
  ]
253
  },
254
  {
255
  "cell_type": "code",
256
  "execution_count": null,
257
+ "id": "62b727b7",
258
  "metadata": {},
259
  "outputs": [],
260
  "source": []
261
  },
262
  {
263
  "cell_type": "code",
264
+ "execution_count": 10,
265
+ "id": "c27f1ef3",
266
  "metadata": {},
267
  "outputs": [
268
  {
269
  "name": "stdout",
270
  "output_type": "stream",
271
  "text": [
272
+ "CPU times: user 19.1 s, sys: 3.81 s, total: 22.9 s\n",
273
+ "Wall time: 22.9 s\n"
274
  ]
275
  }
276
  ],
277
  "source": [
278
  "%%time\n",
279
+ "with open(\"../3gram.arpa\", \"r\") as read_file, open(\"../3gram_correct.arpa\", \"w\") as write_file:\n",
280
  " has_added_eos = False\n",
281
  " for line in read_file:\n",
282
  " if not has_added_eos and \"ngram 1=\" in line:\n",
 
292
  },
293
  {
294
  "cell_type": "code",
295
+ "execution_count": 11,
296
+ "id": "8c8d963b",
297
  "metadata": {},
298
  "outputs": [
299
  {
300
  "name": "stdout",
301
  "output_type": "stream",
302
  "text": [
303
+ "CPU times: user 1min 5s, sys: 12.8 s, total: 1min 18s\n",
304
+ "Wall time: 1min 18s\n"
305
  ]
306
  }
307
  ],
308
  "source": [
309
  "%%time\n",
310
+ "with open(\"../5gram.arpa\", \"r\") as read_file, open(\"../5gram_correct.arpa\", \"w\") as write_file:\n",
311
  " has_added_eos = False\n",
312
  " for line in read_file:\n",
313
  " if not has_added_eos and \"ngram 1=\" in line:\n",
 
324
  {
325
  "cell_type": "code",
326
  "execution_count": null,
327
+ "id": "9447691c",
328
  "metadata": {},
329
  "outputs": [],
330
  "source": []
 
332
  {
333
  "cell_type": "code",
334
  "execution_count": null,
335
+ "id": "95d50071",
336
  "metadata": {},
337
  "outputs": [],
338
  "source": []
 
340
  ],
341
  "metadata": {
342
  "kernelspec": {
343
+ "display_name": "Python 3",
344
  "language": "python",
345
  "name": "python3"
346
  },
 
354
  "name": "python",
355
  "nbconvert_exporter": "python",
356
  "pygments_lexer": "ipython3",
357
+ "version": "3.8.8"
358
  }
359
  },
360
  "nbformat": 4,
src/{text.txt → kenlm_text_te.txt} RENAMED
File without changes