{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset, concatenate_datasets\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset common_voice (/home/ubuntu/.cache/huggingface/datasets/mozilla-foundation___common_voice/mr/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
      "    num_rows: 698\n",
      "})\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset open_slr (/home/ubuntu/.cache/huggingface/datasets/open_slr/SLR64/0.0.0/e0fb9e36094eff565efe812d1aba158f6a46ce834cb9705c91d1e2d6ba78ed31)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['path', 'audio', 'sentence'],\n",
      "    num_rows: 1569\n",
      "})\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration shivam--marathi_samanantar_processed-538aa7995793bd87\n",
      "Reusing dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/shivam--marathi_samanantar_processed-538aa7995793bd87/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['text'],\n",
      "    num_rows: 3047226\n",
      "})\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration shivam--marathi_pib_processed-2348554e5319bdfe\n",
      "Reusing dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/shivam--marathi_pib_processed-2348554e5319bdfe/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['text'],\n",
      "    num_rows: 117199\n",
      "})\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset opus100 (/home/ubuntu/.cache/huggingface/datasets/opus100/en-mr/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)\n",
      "Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/opus100/en-mr/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704/cache-201d21d7acc2864f.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['translation', 'sentence'],\n",
      "    num_rows: 27007\n",
      "})\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset tatoeba (/home/ubuntu/.cache/huggingface/datasets/tatoeba/en-mr/2021.7.22/b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c0dba507cea344768aa20cd7c5593a0c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/53462 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['id', 'translation', 'sentence'],\n",
      "    num_rows: 53462\n",
      "})\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset tapaco (/home/ubuntu/.cache/huggingface/datasets/tapaco/mr/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['paraphrase_set_id', 'sentence_id', 'paraphrase', 'lists', 'tags', 'language'],\n",
      "    num_rows: 16413\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "cv = load_dataset(\"mozilla-foundation/common_voice_8_0\", \"mr\", split=\"train+validation\", use_auth_token=True)\n",
    "print(cv)\n",
    "openslr = load_dataset(\"openslr\", \"SLR64\", split=\"train\")\n",
    "print(openslr)\n",
    "samanantar = load_dataset(\"shivam/marathi_samanantar_processed\", split=\"train\")\n",
    "print(samanantar)\n",
    "pib = load_dataset(\"shivam/marathi_pib_processed\", split=\"train\")\n",
    "print(pib)\n",
    "opus = load_dataset(\"opus100\", \"en-mr\", split=\"train\").map(lambda example: {\"sentence\": example[\"translation\"][\"mr\"]})\n",
    "print(opus)\n",
    "tatoeba = load_dataset(\"tatoeba\", \"en-mr\", split=\"train\").map(lambda example: {\"sentence\": example[\"translation\"][\"mr\"]})\n",
    "print(tatoeba)\n",
    "tapaco = load_dataset(\"tapaco\", \"mr\", split=\"train\")\n",
    "print(tapaco)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['sentence'],\n",
       "    num_rows: 3263574\n",
       "})"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv = cv.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\", 'path', 'audio'])\n",
    "openslr = openslr.remove_columns(['path', 'audio'])\n",
    "samanantar = samanantar.rename_column(\"text\",\"sentence\")\n",
    "pib = pib.rename_column(\"text\",\"sentence\")\n",
    "opus = opus.remove_columns([\"translation\"])\n",
    "tatoeba = tatoeba.remove_columns(['id','translation'])\n",
    "tapaco = tapaco.remove_columns(['paraphrase_set_id', 'sentence_id', 'lists', 'tags', 'language']).rename_column(\"paraphrase\",\"sentence\")\n",
    "\n",
    "text_dataset = concatenate_datasets([cv, openslr, samanantar, pib, opus, tatoeba, tapaco])\n",
    "text_dataset\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–\\।\\!\\\"\\,\\-\\.\\?\\:\\|\\“\\”\\–\\;\\'\\’\\‘\\॔]'   # change to the ignored characters of your fine-tuned model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def extract_text(batch):\n",
    "    text = batch[\"sentence\"]\n",
    "    batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", text.lower())\n",
    "    return batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4334d72e02f140bf9078cb97c5353d70",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3263574 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text'],\n",
       "    num_rows: 3263574\n",
       "})"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = text_dataset.map(extract_text, remove_columns=text_dataset.column_names)\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': 'शिवाय त्यांना कवितेचा आणि चित्रकलेचा छंद होता'}"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"text.txt\", \"w\") as file:\n",
    "    file.write(\" \".join(dataset[\"text\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 1/5 Counting and sorting n-grams ===\n",
      "Reading /ebs/learn/ASR/text.txt\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Unigram tokens 29706056 types 776336\n",
      "=== 2/5 Calculating and sorting adjusted counts ===\n",
      "Chain sizes: 1:9316032 2:20102516736 3:37692219392 4:60307550208 5:87948517376\n",
      "Statistics:\n",
      "1 776335 D1=0.705463 D2=1.0456 D3+=1.33671\n",
      "2 8433103 D1=0.790673 D2=1.11187 D3+=1.35296\n",
      "3 18421039 D1=0.878727 D2=1.22916 D3+=1.39519\n",
      "4 24029132 D1=0.935948 D2=1.36969 D3+=1.49375\n",
      "5 26433229 D1=0.885046 D2=1.58244 D3+=2.0281\n",
      "Memory estimate for binary LM:\n",
      "type      MB\n",
      "probing 1637 assuming -p 1.5\n",
      "probing 1931 assuming -r models -p 1.5\n",
      "trie     833 without quantization\n",
      "trie     476 assuming -q 8 -b 8 quantization \n",
      "trie     726 assuming -a 22 array pointer compression\n",
      "trie     368 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
      "=== 3/5 Calculating and sorting initial probabilities ===\n",
      "Chain sizes: 1:9316020 2:134929648 3:368420780 4:576699168 5:740130412\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
      "Chain sizes: 1:9316020 2:134929648 3:368420780 4:576699168 5:740130412\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 5/5 Writing ARPA model ===\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Name:lmplz\tVmPeak:201429316 kB\tVmRSS:29888 kB\tRSSMax:36259508 kB\tuser:86.1274\tsys:40.4955\tCPU:126.623\treal:99.6214\n"
     ]
    }
   ],
   "source": [
    "!kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\data\\\r\n",
      "ngram 1=776335\r\n",
      "ngram 2=8433103\r\n",
      "ngram 3=18421039\r\n",
      "ngram 4=24029132\r\n",
      "ngram 5=26433229\r\n",
      "\r\n",
      "\\1-grams:\r\n",
      "-6.9649706\t<unk>\t0\r\n",
      "0\t<s>\t-0.10200334\r\n",
      "-3.8677218\tशिवाय\t-0.29601222\r\n",
      "-3.0139472\tत्यांना\t-0.54708624\r\n",
      "-5.7931695\tकवितेचा\t-0.10200334\r\n",
      "-2.2375891\tआणि\t-0.5685015\r\n",
      "-6.046465\tचित्रकलेचा\t-0.16192785\r\n",
      "-4.874536\tछंद\t-0.3758324\r\n",
      "-3.150044\tहोता\t-0.53179973\r\n",
      "-6.514799\tपारंपरिकदृष्ट्या\t-0.10200334\r\n",
      "-4.837577\tज्वारी\t-0.3880814\r\n",
      "-4.9689674\tबाजरी\t-0.32780117\r\n"
     ]
    }
   ],
   "source": [
    "!head -20 5gram.arpa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n",
    "    has_added_eos = False\n",
    "    for line in read_file:\n",
    "        if not has_added_eos and \"ngram 1=\" in line:\n",
    "            count=line.strip().split(\"=\")[-1]\n",
    "            write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
    "        elif not has_added_eos and \"<s>\" in line:\n",
    "            write_file.write(line)\n",
    "            write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
    "            has_added_eos = True\n",
    "        else:\n",
    "            write_file.write(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\data\\\r\n",
      "ngram 1=776336\r\n",
      "ngram 2=8433103\r\n",
      "ngram 3=18421039\r\n",
      "ngram 4=24029132\r\n",
      "ngram 5=26433229\r\n",
      "\r\n",
      "\\1-grams:\r\n",
      "-6.9649706\t<unk>\t0\r\n",
      "0\t<s>\t-0.10200334\r\n",
      "0\t</s>\t-0.10200334\r\n",
      "-3.8677218\tशिवाय\t-0.29601222\r\n",
      "-3.0139472\tत्यांना\t-0.54708624\r\n",
      "-5.7931695\tकवितेचा\t-0.10200334\r\n",
      "-2.2375891\tआणि\t-0.5685015\r\n",
      "-6.046465\tचित्रकलेचा\t-0.16192785\r\n",
      "-4.874536\tछंद\t-0.3758324\r\n",
      "-3.150044\tहोता\t-0.53179973\r\n",
      "-6.514799\tपारंपरिकदृष्ट्या\t-0.10200334\r\n",
      "-4.837577\tज्वारी\t-0.3880814\r\n"
     ]
    }
   ],
   "source": [
    "!head -20 5gram_correct.arpa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoProcessor\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(\"smangrul/xls-r-300m-mr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'|': 0,\n",
       " 'ँ': 1,\n",
       " 'ं': 2,\n",
       " 'ः': 3,\n",
       " 'अ': 4,\n",
       " 'आ': 5,\n",
       " 'इ': 6,\n",
       " 'ई': 7,\n",
       " 'उ': 8,\n",
       " 'ऊ': 9,\n",
       " 'ऋ': 10,\n",
       " 'ए': 11,\n",
       " 'ऐ': 12,\n",
       " 'ऑ': 13,\n",
       " 'ओ': 14,\n",
       " 'औ': 15,\n",
       " 'क': 16,\n",
       " 'ख': 17,\n",
       " 'ग': 18,\n",
       " 'घ': 19,\n",
       " 'च': 20,\n",
       " 'छ': 21,\n",
       " 'ज': 22,\n",
       " 'झ': 23,\n",
       " 'ञ': 24,\n",
       " 'ट': 25,\n",
       " 'ठ': 26,\n",
       " 'ड': 27,\n",
       " 'ढ': 28,\n",
       " 'ण': 29,\n",
       " 'त': 30,\n",
       " 'थ': 31,\n",
       " 'द': 32,\n",
       " 'ध': 33,\n",
       " 'न': 34,\n",
       " 'प': 35,\n",
       " 'फ': 36,\n",
       " 'ब': 37,\n",
       " 'भ': 38,\n",
       " 'म': 39,\n",
       " 'य': 40,\n",
       " 'र': 41,\n",
       " 'ऱ': 42,\n",
       " 'ल': 43,\n",
       " 'ळ': 44,\n",
       " 'व': 45,\n",
       " 'श': 46,\n",
       " 'ष': 47,\n",
       " 'स': 48,\n",
       " 'ह': 49,\n",
       " '़': 50,\n",
       " 'ा': 51,\n",
       " 'ि': 52,\n",
       " 'ी': 53,\n",
       " 'ु': 54,\n",
       " 'ू': 55,\n",
       " 'ृ': 56,\n",
       " 'ॄ': 57,\n",
       " 'ॅ': 58,\n",
       " 'े': 59,\n",
       " 'ै': 60,\n",
       " 'ॉ': 61,\n",
       " 'ॊ': 62,\n",
       " 'ो': 63,\n",
       " 'ौ': 64,\n",
       " '्': 65,\n",
       " 'ॲ': 66,\n",
       " '[unk]': 67,\n",
       " '[pad]': 68,\n",
       " '<s>': 69,\n",
       " '</s>': 70}"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_dict = processor.tokenizer.get_vocab()\n",
    "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}\n",
    "sorted_vocab_dict\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the LM will be faster if you build a binary file.\n",
      "Reading /ebs/learn/ASR/5gram_correct.arpa\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
      "Unigrams and labels don't seem to agree.\n"
     ]
    }
   ],
   "source": [
    "from pyctcdecode import build_ctcdecoder\n",
    "\n",
    "decoder = build_ctcdecoder(\n",
    "    labels=list(sorted_vocab_dict.keys()),\n",
    "    kenlm_model_path=\"5gram_correct.arpa\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pyctcdecode.decoder.BeamSearchDecoderCTC at 0x7fe8a63c65d0>"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "decoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Wav2Vec2ProcessorWithLM\n",
    "\n",
    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
    "    feature_extractor=processor.feature_extractor,\n",
    "    tokenizer=processor.tokenizer,\n",
    "    decoder=decoder\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "processor_with_lm.save_pretrained(\"./smangrul/xls-r-300m-mr/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "processor_with_lm.save_pretrained(\"./../xls-r-300m-mr-model/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hf",
   "language": "python",
   "name": "hf"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}