chmanoj committed
Commit 2ea8870
1 Parent(s): edc73dd

Add kenLM notebooks

.gitattributes CHANGED
@@ -1,5 +1,6 @@
  *.7z filter=lfs diff=lfs merge=lfs -text
  *.arrow filter=lfs diff=lfs merge=lfs -text
+ text.txt filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
  *.bin.* filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
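
This added pattern is the entry that "git lfs track text.txt" writes to .gitattributes; it keeps the roughly 700 MB corpus dump added below (src/text.txt) in the repository as a Git LFS pointer rather than a regular blob.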
src/Create_LM.ipynb ADDED
@@ -0,0 +1,362 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "5e445ce4-1507-482d-a2a8-03d8802e6311",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "1c1820bc-0125-4589-983f-e454801435a5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "117e880c8ae8437e9a16ccdf20b659eb",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/1.68k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using custom data configuration chmanoj--ai4bharat__samanantar_processed_te-a0473fa2e2573d48\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading and preparing dataset samanantar/te (download: 292.93 MiB, generated: 678.62 MiB, post-processed: Unknown size, total: 971.55 MiB) to /home/manoj/.cache/huggingface/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-a0473fa2e2573d48/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "bd1bfffa9a424a45b3b7324458818f4a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "22a24004a7a546ea88bf7c3fe1c16e46",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/151M [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9e4a161541734dfbb2de2d3dd46d8753",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/156M [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "992db97134c94b9284b421c7f3ea0b33",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dataset parquet downloaded and prepared to /home/manoj/.cache/huggingface/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-a0473fa2e2573d48/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
+ ]
+ }
+ ],
+ "source": [
+ "dataset = load_dataset(f\"chmanoj/ai4bharat__samanantar_processed_te\", split=\"train\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "62fb01f7-24fe-4384-9940-3c262c321a5d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(\"text.txt\", \"w\") as file:\n",
+ " file.write(\" \".join(dataset[\"text\"]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4295ab4b-b4d8-4a39-a896-fb86503e4674",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "fcc0b573-516a-45d6-af2a-feace521c16d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/mnt/c/Projects/Speech/xls-R-finetuning/lm_te'"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import os\n",
+ "os.getcwd()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "e1f8f887-6201-4ae0-989e-8bdc57816db1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "=== 1/5 Counting and sorting n-grams ===\n",
+ "Reading /mnt/c/Projects/Speech/xls-R-finetuning/lm_te/text.txt\n",
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+ "****************************************************************************************************\n",
+ "Unigram tokens 32852369 types 1308846\n",
+ "=== 2/5 Calculating and sorting adjusted counts ===\n",
+ "Chain sizes: 1:15706152 2:2291295744 3:4296179712\n",
+ "Statistics:\n",
+ "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
+ "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
+ "3 23789023 D1=0.823705 D2=1.50814 D3+=1.24837\n",
+ "Memory estimate for binary LM:\n",
+ "type MB\n",
+ "probing 731 assuming -p 1.5\n",
+ "probing 809 assuming -r models -p 1.5\n",
+ "trie 342 without quantization\n",
+ "trie 206 assuming -q 8 -b 8 quantization \n",
+ "trie 316 assuming -a 22 array pointer compression\n",
+ "trie 180 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
+ "=== 3/5 Calculating and sorting initial probabilities ===\n",
+ "Chain sizes: 1:15706140 2:203523824 3:475780460\n",
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+ "####################################################################################################\n",
+ "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
+ "Chain sizes: 1:15706140 2:203523824 3:475780460\n",
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+ "####################################################################################################\n",
+ "=== 5/5 Writing ARPA model ===\n",
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+ "****************************************************************************************************\n",
+ "Name:lmplz\tVmPeak:6613460 kB\tVmRSS:37976 kB\tRSSMax:1975488 kB\tuser:33.1964\tsys:9.29228\tCPU:42.4891\treal:65.5831\n"
+ ]
+ }
+ ],
+ "source": [
+ "!../kenlm/build/bin/lmplz -o 3 <\"text.txt\" > \"3gram.arpa\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "afee7f94-f247-4891-822e-1f4edd5abc81",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "=== 1/5 Counting and sorting n-grams ===\n",
+ "Reading /mnt/c/Projects/Speech/xls-R-finetuning/lm_te/text.txt\n",
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+ "****************************************************************************************************\n",
+ "Unigram tokens 32852369 types 1308846\n",
+ "=== 2/5 Calculating and sorting adjusted counts ===\n",
+ "Chain sizes: 1:15706152 2:642680448 3:1205025920 4:1928041344 5:2811727104\n",
+ "Statistics:\n",
+ "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
+ "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
+ "3 23789023 D1=0.910002 D2=1.27136 D3+=1.38596\n",
+ "4 28332665 D1=0.955371 D2=1.42566 D3+=1.4677\n",
+ "5 30063763 D1=0.898851 D2=1.71714 D3+=1.29889\n",
+ "Memory estimate for binary LM:\n",
+ "type MB\n",
+ "probing 2032 assuming -p 1.5\n",
+ "probing 2408 assuming -r models -p 1.5\n",
+ "trie 1058 without quantization\n",
+ "trie 613 assuming -q 8 -b 8 quantization \n",
+ "trie 921 assuming -a 22 array pointer compression\n",
+ "trie 476 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
+ "=== 3/5 Calculating and sorting initial probabilities ===\n",
+ "Chain sizes: 1:15706140 2:203523824 3:475780460 4:679983960 5:841785364\n",
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+ "####################################################################################################\n",
+ "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
+ "Chain sizes: 1:15706140 2:203523824 3:475780460 4:679983960 5:841785364\n",
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+ "####################################################################################################\n",
+ "=== 5/5 Writing ARPA model ===\n",
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+ "****************************************************************************************************\n",
+ "Name:lmplz\tVmPeak:6620664 kB\tVmRSS:38084 kB\tRSSMax:2239444 kB\tuser:77.3579\tsys:28.8403\tCPU:106.198\treal:159.405\n"
+ ]
+ }
+ ],
+ "source": [
+ "!../kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4d4f8526-fb6a-40cc-bf02-75c78b4138cd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "33e3c247-1b4b-4e61-a42e-283bef351c4b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 22.7 s, sys: 6.28 s, total: 28.9 s\n",
+ "Wall time: 1min 29s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "with open(\"3gram.arpa\", \"r\") as read_file, open(\"3gram_correct.arpa\", \"w\") as write_file:\n",
+ " has_added_eos = False\n",
+ " for line in read_file:\n",
+ " if not has_added_eos and \"ngram 1=\" in line:\n",
+ " count=line.strip().split(\"=\")[-1]\n",
+ " write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
+ " elif not has_added_eos and \"<s>\" in line:\n",
+ " write_file.write(line)\n",
+ " write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
+ " has_added_eos = True\n",
+ " else:\n",
+ " write_file.write(line)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "0f8ead29-e478-48dd-ace5-46d787d3d68e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 1min 25s, sys: 27.2 s, total: 1min 52s\n",
+ "Wall time: 5min 28s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n",
+ " has_added_eos = False\n",
+ " for line in read_file:\n",
+ " if not has_added_eos and \"ngram 1=\" in line:\n",
+ " count=line.strip().split(\"=\")[-1]\n",
+ " write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
+ " elif not has_added_eos and \"<s>\" in line:\n",
+ " write_file.write(line)\n",
+ " write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
+ " has_added_eos = True\n",
+ " else:\n",
+ " write_file.write(line)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad4ea204-d61c-4316-bc30-5bbda696d225",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "152fecfe-9a51-4f6d-9640-c810adb5e456",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
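
Downstream of Create_LM.ipynb, the corrected ARPA files are usually binarized and attached to the fine-tuned acoustic model so that CTC beam search can use the n-gram scores. The following is a minimal sketch of that step, assuming KenLM's build_binary was compiled next to lmplz and that pyctcdecode and transformers are installed; "chmanoj/xls-r-300m-te" is an illustrative checkpoint name, not something this commit defines.

# Sketch only: binarize the corrected 5-gram and wrap it in a CTC decoder.
import subprocess

from pyctcdecode import build_ctcdecoder
from transformers import AutoProcessor, Wav2Vec2ProcessorWithLM

# Convert the ARPA text format to KenLM's binary format (smaller, faster to load).
subprocess.run(
    ["../kenlm/build/bin/build_binary", "5gram_correct.arpa", "5gram.bin"],
    check=True,
)

# The decoder expects the CTC labels in vocabulary-id order.
processor = AutoProcessor.from_pretrained("chmanoj/xls-r-300m-te")  # assumed checkpoint
vocab = processor.tokenizer.get_vocab()
labels = [token for token, _ in sorted(vocab.items(), key=lambda item: item[1])]

decoder = build_ctcdecoder(labels, kenlm_model_path="5gram.bin")

# Bundle feature extractor, tokenizer and LM decoder so batch_decode uses the n-gram.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder,
)
processor_with_lm.save_pretrained("xls-r-300m-te-with-lm")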
src/Create_dataset_te.ipynb ADDED
@@ -0,0 +1,263 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "3a55acf6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/workspace/xls-r-300m-te'"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import os\n",
+ "os.getcwd()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8491f5f9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "fed9879a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "dc35d55b7a9444128bb348a38969453f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/3.92k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading and preparing dataset samanantar/te to /workspace/.cache/huggingface/datasets/ai4bharat___samanantar/te/0.3.0/556308f80c011cb3c32f3de18199d7b1e4cf9ca707843c92bb0bede0e47a8bd6...\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "de6defc6eb934d87ab8a18cd4fe2a04d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/4.60G [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "0 examples [00:00, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dataset samanantar downloaded and prepared to /workspace/.cache/huggingface/datasets/ai4bharat___samanantar/te/0.3.0/556308f80c011cb3c32f3de18199d7b1e4cf9ca707843c92bb0bede0e47a8bd6. Subsequent calls will reuse this data.\n"
+ ]
+ }
+ ],
+ "source": [
+ "dataset = load_dataset(\"ai4bharat/samanantar\", \"te\", split=\"train\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "5c478941",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–]'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "abf69ac9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "def extract_text(batch):\n",
+ " text = batch[\"tgt\"]\n",
+ " batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", text.lower())\n",
+ " return batch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "6b4d0c6c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'వర్షాలకు చేతికి వచ్చిన పంట దెబ్బతిన్నదని రైతులు వాపోతున్నారు'"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset[0]['tgt']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "710de6ce",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "cc51f1d8191c4118b9281727e6ec4b63",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/4661986 [00:00<?, ?ex/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "dataset = dataset.map(extract_text, remove_columns=dataset.column_names)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "bd4c05b4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a0a50384591d42489963b8990624ab95",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(HTML(value='<center>\\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from huggingface_hub import notebook_login\n",
+ "\n",
+ "notebook_login()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "791becc3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "02abddfd320f404ba7970f6208f9cc27",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Pushing dataset shards to the dataset hub: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "dataset.push_to_hub(f\"ai4bharat__samanantar_processed_te\", split=\"train\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d34464c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d8308be",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
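
The dataset pushed by Create_dataset_te.ipynb is the same Hub repo that Create_LM.ipynb loads back. A quick spot-check sketch follows: it streams a few rows and confirms the cleaning in extract_text held, assuming a datasets release with streaming support and that the pushed repo resolves as "chmanoj/ai4bharat__samanantar_processed_te".

# Sketch only: stream a handful of rows and verify the stripped characters are gone.
import re
from itertools import islice

from datasets import load_dataset

chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # same pattern as the cell above

streamed = load_dataset(
    "chmanoj/ai4bharat__samanantar_processed_te",  # assumed Hub id after push_to_hub
    split="train",
    streaming=True,
)

for row in islice(streamed, 100):
    text = row["text"]
    assert re.search(chars_to_ignore_regex, text) is None, text
    assert text == text.lower(), text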
src/text.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:576b4cccf2cd0a29d989ba3823293e051ded9cc2dd8b70356923a3557a691bb1
+ size 697581014