{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "451d890e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "eb0e4037",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3451cb7648e349cbbbdea3b672207ef7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset samanantar/te (download: 292.93 MiB, generated: 678.62 MiB, post-processed: Unknown size, total: 971.55 MiB) to /workspace/cache/hf/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "68ea006ea9b943c3af2ed5ee7bb9fffb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b5276db8e4614107ad0bdfe67ccca2fd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/151M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0d3e27b107e7401dbe7f5dad8aa7ec08",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/156M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ead9e8fde9a842b295955332ecae540d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset parquet downloaded and prepared to /workspace/cache/hf/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
     ]
    }
   ],
   "source": [
    "# Load the train split of the processed Samanantar `te` dataset from the Hugging Face Hub.\n",
    "# The repo id is a plain string: it has no placeholders, so no f-string prefix is used.\n",
    "dataset = load_dataset(\"chmanoj/ai4bharat__samanantar_processed_te\", split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e4f4f4e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dump every entry of the dataset's \"text\" column into one space-separated file;\n",
    "# the lmplz cells further down read kenlm_text_te.txt from stdin.\n",
    "# The encoding is pinned to UTF-8 so the write does not depend on the locale default.\n",
    "with open(\"kenlm_text_te.txt\", \"w\", encoding=\"utf-8\") as file:\n",
    "  file.write(\" \".join(dataset[\"text\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e8a0e84",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5dfbf3e1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/workspace/kenlm_te/src'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the kernel's working directory: the other cells use relative paths\n",
    "# (\"kenlm_text_te.txt\", \"../3gram.arpa\", \"../../kenlm/build/bin/lmplz\").\n",
    "import os\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "494bec1a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 1/5 Counting and sorting n-grams ===\n",
      "Reading /workspace/kenlm_te/src/kenlm_text_te.txt\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Unigram tokens 32852369 types 1308846\n",
      "=== 2/5 Calculating and sorting adjusted counts ===\n",
      "Chain sizes: 1:15706152 2:51606089728 3:96761421824\n",
      "Statistics:\n",
      "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
      "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
      "3 23789023 D1=0.823705 D2=1.50814 D3+=1.24837\n",
      "Memory estimate for binary LM:\n",
      "type     MB\n",
      "probing 731 assuming -p 1.5\n",
      "probing 809 assuming -r models -p 1.5\n",
      "trie    342 without quantization\n",
      "trie    206 assuming -q 8 -b 8 quantization \n",
      "trie    316 assuming -a 22 array pointer compression\n",
      "trie    180 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
      "=== 3/5 Calculating and sorting initial probabilities ===\n",
      "Chain sizes: 1:15706140 2:203523824 3:475780460\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
      "Chain sizes: 1:15706140 2:203523824 3:475780460\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 5/5 Writing ARPA model ===\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Name:lmplz\tVmPeak:145080616 kB\tVmRSS:38292 kB\tRSSMax:33928732 kB\tuser:43.6485\tsys:27.5682\tCPU:71.2168\treal:64.983\n"
     ]
    }
   ],
   "source": [
    "# Run KenLM's lmplz with order 3 (-o 3): kenlm_text_te.txt is fed on stdin\n",
    "# and stdout is redirected to ../3gram.arpa.\n",
    "!../../kenlm/build/bin/lmplz -o 3 <\"kenlm_text_te.txt\" > \"../3gram.arpa\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c2c8c8ce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 1/5 Counting and sorting n-grams ===\n",
      "Reading /workspace/kenlm_te/src/kenlm_text_te.txt\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Unigram tokens 32852369 types 1308846\n",
      "=== 2/5 Calculating and sorting adjusted counts ===\n",
      "Chain sizes: 1:15706152 2:14474877952 3:27140399104 4:43424632832 5:63327596544\n",
      "Statistics:\n",
      "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
      "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
      "3 23789023 D1=0.910002 D2=1.27136 D3+=1.38596\n",
      "4 28332665 D1=0.955371 D2=1.42566 D3+=1.4677\n",
      "5 30063763 D1=0.898851 D2=1.71714 D3+=1.29889\n",
      "Memory estimate for binary LM:\n",
      "type      MB\n",
      "probing 2032 assuming -p 1.5\n",
      "probing 2408 assuming -r models -p 1.5\n",
      "trie    1058 without quantization\n",
      "trie     613 assuming -q 8 -b 8 quantization \n",
      "trie     921 assuming -a 22 array pointer compression\n",
      "trie     476 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
      "=== 3/5 Calculating and sorting initial probabilities ===\n",
      "Chain sizes: 1:15706140 2:203523824 3:475780460 4:679983960 5:841785364\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
      "Chain sizes: 1:15706140 2:203523824 3:475780460 4:679983960 5:841785364\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 5/5 Writing ARPA model ===\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Name:lmplz\tVmPeak:145104204 kB\tVmRSS:38296 kB\tRSSMax:26419104 kB\tuser:89.0779\tsys:42.0565\tCPU:131.134\treal:97.4678\n"
     ]
    }
   ],
   "source": [
    "# Run KenLM's lmplz with order 5 (-o 5): kenlm_text_te.txt is fed on stdin\n",
    "# and stdout is redirected to ../5gram.arpa.\n",
    "!../../kenlm/build/bin/lmplz -o 5 <\"kenlm_text_te.txt\" > \"../5gram.arpa\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62b727b7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c27f1ef3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 19.1 s, sys: 3.81 s, total: 22.9 s\n",
      "Wall time: 22.9 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Copy ../3gram.arpa to ../3gram_correct.arpa while adding a \"</s>\" entry:\n",
    "#   * the header's \"ngram 1=<count>\" line gets its count incremented by one;\n",
    "#   * the first line containing \"<s>\" is written twice, the second time with\n",
    "#     \"<s>\" swapped for \"</s>\" (the rest of that line is reused unchanged).\n",
    "# All other lines are copied through untouched.\n",
    "# Both files are opened as UTF-8 explicitly so non-ASCII tokens survive the\n",
    "# round trip regardless of the platform's default encoding.\n",
    "with open(\"../3gram.arpa\", \"r\", encoding=\"utf-8\") as read_file, open(\"../3gram_correct.arpa\", \"w\", encoding=\"utf-8\") as write_file:\n",
    "  has_added_eos = False\n",
    "  for line in read_file:\n",
    "    if not has_added_eos and \"ngram 1=\" in line:\n",
    "      # Rebuild the header line from its parts instead of str.replace() on the\n",
    "      # number, so only the count after the last \"=\" can ever change.\n",
    "      prefix, _, count = line.rstrip(\"\\n\").rpartition(\"=\")\n",
    "      write_file.write(f\"{prefix}={int(count) + 1}\\n\")\n",
    "    elif not has_added_eos and \"<s>\" in line:\n",
    "      write_file.write(line)\n",
    "      write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
    "      has_added_eos = True  # only the first \"<s>\" line is duplicated\n",
    "    else:\n",
    "      write_file.write(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8c8d963b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 5s, sys: 12.8 s, total: 1min 18s\n",
      "Wall time: 1min 18s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Copy ../5gram.arpa to ../5gram_correct.arpa while adding a \"</s>\" entry:\n",
    "#   * the header's \"ngram 1=<count>\" line gets its count incremented by one;\n",
    "#   * the first line containing \"<s>\" is written twice, the second time with\n",
    "#     \"<s>\" swapped for \"</s>\" (the rest of that line is reused unchanged).\n",
    "# All other lines are copied through untouched.\n",
    "# Both files are opened as UTF-8 explicitly so non-ASCII tokens survive the\n",
    "# round trip regardless of the platform's default encoding.\n",
    "with open(\"../5gram.arpa\", \"r\", encoding=\"utf-8\") as read_file, open(\"../5gram_correct.arpa\", \"w\", encoding=\"utf-8\") as write_file:\n",
    "  has_added_eos = False\n",
    "  for line in read_file:\n",
    "    if not has_added_eos and \"ngram 1=\" in line:\n",
    "      # Rebuild the header line from its parts instead of str.replace() on the\n",
    "      # number, so only the count after the last \"=\" can ever change.\n",
    "      prefix, _, count = line.rstrip(\"\\n\").rpartition(\"=\")\n",
    "      write_file.write(f\"{prefix}={int(count) + 1}\\n\")\n",
    "    elif not has_added_eos and \"<s>\" in line:\n",
    "      write_file.write(line)\n",
    "      write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
    "      has_added_eos = True  # only the first \"<s>\" line is duplicated\n",
    "    else:\n",
    "      write_file.write(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9447691c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95d50071",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}