File size: 11,883 Bytes
2ea8870
 
 
 
c82a77a
 
2ea8870
 
 
 
 
 
 
 
c82a77a
 
2ea8870
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
c82a77a
2ea8870
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
 
2ea8870
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
c82a77a
2ea8870
 
 
 
c82a77a
 
2ea8870
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
c82a77a
2ea8870
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
c82a77a
 
2ea8870
 
 
 
 
 
c82a77a
 
2ea8870
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
 
2ea8870
 
 
 
 
 
c82a77a
 
2ea8870
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
 
 
 
 
 
 
 
 
c82a77a
2ea8870
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "451d890e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "eb0e4037",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3451cb7648e349cbbbdea3b672207ef7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset samanantar/te (download: 292.93 MiB, generated: 678.62 MiB, post-processed: Unknown size, total: 971.55 MiB) to /workspace/cache/hf/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "68ea006ea9b943c3af2ed5ee7bb9fffb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b5276db8e4614107ad0bdfe67ccca2fd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/151M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0d3e27b107e7401dbe7f5dad8aa7ec08",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/156M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ead9e8fde9a842b295955332ecae540d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset parquet downloaded and prepared to /workspace/cache/hf/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
     ]
    }
   ],
   "source": [
    "# Load the train split of the processed Samanantar `te` text dataset.\n",
    "dataset = load_dataset(\"chmanoj/ai4bharat__samanantar_processed_te\", split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e4f4f4e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dump the whole \"text\" column into one space-joined file; the lmplz cells read it from stdin.\n",
    "# UTF-8 is set explicitly so the output does not depend on the platform's locale encoding.\n",
    "with open(\"kenlm_text_te.txt\", \"w\", encoding=\"utf-8\") as file:\n",
    "  file.write(\" \".join(dataset[\"text\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e8a0e84",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5dfbf3e1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/workspace/kenlm_te/src'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the current working directory; the file paths used in this notebook are relative to it.\n",
    "import os\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "494bec1a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 1/5 Counting and sorting n-grams ===\n",
      "Reading /workspace/kenlm_te/src/kenlm_text_te.txt\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Unigram tokens 32852369 types 1308846\n",
      "=== 2/5 Calculating and sorting adjusted counts ===\n",
      "Chain sizes: 1:15706152 2:51606089728 3:96761421824\n",
      "Statistics:\n",
      "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
      "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
      "3 23789023 D1=0.823705 D2=1.50814 D3+=1.24837\n",
      "Memory estimate for binary LM:\n",
      "type     MB\n",
      "probing 731 assuming -p 1.5\n",
      "probing 809 assuming -r models -p 1.5\n",
      "trie    342 without quantization\n",
      "trie    206 assuming -q 8 -b 8 quantization \n",
      "trie    316 assuming -a 22 array pointer compression\n",
      "trie    180 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
      "=== 3/5 Calculating and sorting initial probabilities ===\n",
      "Chain sizes: 1:15706140 2:203523824 3:475780460\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
      "Chain sizes: 1:15706140 2:203523824 3:475780460\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 5/5 Writing ARPA model ===\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Name:lmplz\tVmPeak:145080616 kB\tVmRSS:38292 kB\tRSSMax:33928732 kB\tuser:43.6485\tsys:27.5682\tCPU:71.2168\treal:64.983\n"
     ]
    }
   ],
   "source": [
    "# Train a 3-gram model (-o 3) with KenLM's lmplz: corpus on stdin, ARPA file on stdout.\n",
    "!../../kenlm/build/bin/lmplz -o 3 <\"kenlm_text_te.txt\" > \"../3gram.arpa\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c2c8c8ce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 1/5 Counting and sorting n-grams ===\n",
      "Reading /workspace/kenlm_te/src/kenlm_text_te.txt\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Unigram tokens 32852369 types 1308846\n",
      "=== 2/5 Calculating and sorting adjusted counts ===\n",
      "Chain sizes: 1:15706152 2:14474877952 3:27140399104 4:43424632832 5:63327596544\n",
      "Statistics:\n",
      "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
      "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
      "3 23789023 D1=0.910002 D2=1.27136 D3+=1.38596\n",
      "4 28332665 D1=0.955371 D2=1.42566 D3+=1.4677\n",
      "5 30063763 D1=0.898851 D2=1.71714 D3+=1.29889\n",
      "Memory estimate for binary LM:\n",
      "type      MB\n",
      "probing 2032 assuming -p 1.5\n",
      "probing 2408 assuming -r models -p 1.5\n",
      "trie    1058 without quantization\n",
      "trie     613 assuming -q 8 -b 8 quantization \n",
      "trie     921 assuming -a 22 array pointer compression\n",
      "trie     476 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
      "=== 3/5 Calculating and sorting initial probabilities ===\n",
      "Chain sizes: 1:15706140 2:203523824 3:475780460 4:679983960 5:841785364\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
      "Chain sizes: 1:15706140 2:203523824 3:475780460 4:679983960 5:841785364\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 5/5 Writing ARPA model ===\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Name:lmplz\tVmPeak:145104204 kB\tVmRSS:38296 kB\tRSSMax:26419104 kB\tuser:89.0779\tsys:42.0565\tCPU:131.134\treal:97.4678\n"
     ]
    }
   ],
   "source": [
    "# Train a 5-gram model (-o 5) with KenLM's lmplz: corpus on stdin, ARPA file on stdout.\n",
    "!../../kenlm/build/bin/lmplz -o 5 <\"kenlm_text_te.txt\" > \"../5gram.arpa\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62b727b7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d3b5a10",
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_eos_to_arpa(src_path, dst_path):\n",
    "  \"\"\"Copy the ARPA file at src_path to dst_path, adding a `</s>` unigram entry.\n",
    "\n",
    "  Until the `<s>` line has been handled, any line containing `ngram 1=` is\n",
    "  rewritten with its count raised by one. The first line containing `<s>` is\n",
    "  written twice: once unchanged and once with `<s>` replaced by `</s>`.\n",
    "  Every other line is copied through untouched.\n",
    "\n",
    "  Args:\n",
    "    src_path: path of the ARPA file to read.\n",
    "    dst_path: path of the corrected ARPA file to write (overwritten if present).\n",
    "\n",
    "  Raises:\n",
    "    ValueError: if the text after `=` on the `ngram 1=` line is not an integer.\n",
    "  \"\"\"\n",
    "  # Explicit UTF-8 so reading/writing does not depend on the locale's default encoding.\n",
    "  with open(src_path, \"r\", encoding=\"utf-8\") as read_file, open(dst_path, \"w\", encoding=\"utf-8\") as write_file:\n",
    "    has_added_eos = False\n",
    "    for line in read_file:\n",
    "      if not has_added_eos and \"ngram 1=\" in line:\n",
    "        # Rebuild the header rather than str.replace-ing the digits: replace() rewrites\n",
    "        # every occurrence, so a count of \"1\" would also change the \"1\" in \"ngram 1=\".\n",
    "        prefix, _, count = line.rstrip().rpartition(\"=\")\n",
    "        write_file.write(f\"{prefix}={int(count) + 1}\\n\")\n",
    "      elif not has_added_eos and \"<s>\" in line:\n",
    "        # Duplicate the first <s> entry as </s>, then stop special-casing lines.\n",
    "        write_file.write(line)\n",
    "        write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
    "        has_added_eos = True\n",
    "      else:\n",
    "        write_file.write(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c27f1ef3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 19.1 s, sys: 3.81 s, total: 22.9 s\n",
      "Wall time: 22.9 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "add_eos_to_arpa(\"../3gram.arpa\", \"../3gram_correct.arpa\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8c8d963b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 5s, sys: 12.8 s, total: 1min 18s\n",
      "Wall time: 1min 18s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "add_eos_to_arpa(\"../5gram.arpa\", \"../5gram_correct.arpa\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9447691c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95d50071",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}