smangrul
/

xls-r-mr-model

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset, concatenate_datasets\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reusing dataset common_voice (/home/ubuntu/.cache/huggingface/datasets/mozilla-foundation___common_voice/mr/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
+      "    num_rows: 698\n",
+      "})\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reusing dataset open_slr (/home/ubuntu/.cache/huggingface/datasets/open_slr/SLR64/0.0.0/e0fb9e36094eff565efe812d1aba158f6a46ce834cb9705c91d1e2d6ba78ed31)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['path', 'audio', 'sentence'],\n",
+      "    num_rows: 1569\n",
+      "})\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using custom data configuration shivam--marathi_samanantar_processed-538aa7995793bd87\n",
+      "Reusing dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/shivam--marathi_samanantar_processed-538aa7995793bd87/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['text'],\n",
+      "    num_rows: 3047226\n",
+      "})\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using custom data configuration shivam--marathi_pib_processed-2348554e5319bdfe\n",
+      "Reusing dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/shivam--marathi_pib_processed-2348554e5319bdfe/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['text'],\n",
+      "    num_rows: 117199\n",
+      "})\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reusing dataset opus100 (/home/ubuntu/.cache/huggingface/datasets/opus100/en-mr/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)\n",
+      "Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/opus100/en-mr/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704/cache-201d21d7acc2864f.arrow\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['translation', 'sentence'],\n",
+      "    num_rows: 27007\n",
+      "})\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reusing dataset tatoeba (/home/ubuntu/.cache/huggingface/datasets/tatoeba/en-mr/2021.7.22/b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c0dba507cea344768aa20cd7c5593a0c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/53462 [00:00<?, ?ex/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['id', 'translation', 'sentence'],\n",
+      "    num_rows: 53462\n",
+      "})\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reusing dataset tapaco (/home/ubuntu/.cache/huggingface/datasets/tapaco/mr/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['paraphrase_set_id', 'sentence_id', 'paraphrase', 'lists', 'tags', 'language'],\n",
+      "    num_rows: 16413\n",
+      "})\n"
+     ]
+    }
+   ],
+   "source": [
+    "def marathi_side(example):\n",
+    "    \"\"\"Expose the Marathi half of an en-mr translation pair as `sentence`.\"\"\"\n",
+    "    return {\"sentence\": example[\"translation\"][\"mr\"]}\n",
+    "\n",
+    "\n",
+    "# Each corpus is printed right after loading so its columns and row count are visible.\n",
+    "cv = load_dataset(\"mozilla-foundation/common_voice_8_0\", \"mr\", split=\"train+validation\", use_auth_token=True)\n",
+    "print(cv)\n",
+    "openslr = load_dataset(\"openslr\", \"SLR64\", split=\"train\")\n",
+    "print(openslr)\n",
+    "samanantar = load_dataset(\"shivam/marathi_samanantar_processed\", split=\"train\")\n",
+    "print(samanantar)\n",
+    "pib = load_dataset(\"shivam/marathi_pib_processed\", split=\"train\")\n",
+    "print(pib)\n",
+    "\n",
+    "# Parallel en-mr corpora: copy the Marathi side of each pair into a `sentence` column.\n",
+    "opus = load_dataset(\"opus100\", \"en-mr\", split=\"train\")\n",
+    "opus = opus.map(marathi_side)\n",
+    "print(opus)\n",
+    "tatoeba = load_dataset(\"tatoeba\", \"en-mr\", split=\"train\")\n",
+    "tatoeba = tatoeba.map(marathi_side)\n",
+    "print(tatoeba)\n",
+    "\n",
+    "tapaco = load_dataset(\"tapaco\", \"mr\", split=\"train\")\n",
+    "print(tapaco)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['sentence'],\n",
+       "    num_rows: 3263574\n",
+       "})"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def sentence_only(ds, text_column=\"sentence\"):\n",
+    "    \"\"\"Return `ds` reduced to a single `sentence` column.\n",
+    "\n",
+    "    `text_column` names the column that holds the text. It is renamed to\n",
+    "    `sentence` when it has another name, and every other column is dropped.\n",
+    "    \"\"\"\n",
+    "    if text_column != \"sentence\":\n",
+    "        ds = ds.rename_column(text_column, \"sentence\")\n",
+    "    extra_columns = [name for name in ds.column_names if name != \"sentence\"]\n",
+    "    return ds.remove_columns(extra_columns) if extra_columns else ds\n",
+    "\n",
+    "\n",
+    "# The loaded datasets are not rebound here, so this cell gives the same\n",
+    "# result every time it is executed.\n",
+    "text_dataset = concatenate_datasets([\n",
+    "    sentence_only(cv),\n",
+    "    sentence_only(openslr),\n",
+    "    sentence_only(samanantar, text_column=\"text\"),\n",
+    "    sentence_only(pib, text_column=\"text\"),\n",
+    "    sentence_only(opus),\n",
+    "    sentence_only(tatoeba),\n",
+    "    sentence_only(tapaco, text_column=\"paraphrase\"),\n",
+    "])\n",
+    "text_dataset\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Raw string: the pattern uses regex escapes (\\-, \\;, \\:, ...) that are not\n",
+    "# valid Python string escapes, so a plain literal triggers invalid-escape warnings.\n",
+    "chars_to_ignore_regex = r'[,?.!\\-\\;\\:\"“%‘”�—’…–\\।\\!\\\"\\,\\-\\.\\?\\:\\|\\“\\”\\–\\;\\'\\’\\‘\\॔]'   # change to the ignored characters of your fine-tuned model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def extract_text(batch):\n",
+    "    \"\"\"Store the lower-cased `sentence`, minus the ignored characters, under `text`.\n",
+    "\n",
+    "    The example is updated in place and returned.\n",
+    "    \"\"\"\n",
+    "    lowered = batch[\"sentence\"].lower()\n",
+    "    batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", lowered)\n",
+    "    return batch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4334d72e02f140bf9078cb97c5353d70",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3263574 [00:00<?, ?ex/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['text'],\n",
+       "    num_rows: 3263574\n",
+       "})"
+      ]
+     },
+     "execution_count": 76,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset = text_dataset.map(extract_text, remove_columns=text_dataset.column_names)\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'text': 'शिवाय त्यांना कवितेचा आणि चित्रकलेचा छंद होता'}"
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Explicit UTF-8: the corpus is Devanagari text, so the platform default\n",
+    "# encoding must not decide how the file is written.\n",
+    "with open(\"text.txt\", \"w\", encoding=\"utf-8\") as file:\n",
+    "    file.write(\" \".join(dataset[\"text\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== 1/5 Counting and sorting n-grams ===\n",
+      "Reading /ebs/learn/ASR/text.txt\n",
+      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "****************************************************************************************************\n",
+      "Unigram tokens 29706056 types 776336\n",
+      "=== 2/5 Calculating and sorting adjusted counts ===\n",
+      "Chain sizes: 1:9316032 2:20102516736 3:37692219392 4:60307550208 5:87948517376\n",
+      "Statistics:\n",
+      "1 776335 D1=0.705463 D2=1.0456 D3+=1.33671\n",
+      "2 8433103 D1=0.790673 D2=1.11187 D3+=1.35296\n",
+      "3 18421039 D1=0.878727 D2=1.22916 D3+=1.39519\n",
+      "4 24029132 D1=0.935948 D2=1.36969 D3+=1.49375\n",
+      "5 26433229 D1=0.885046 D2=1.58244 D3+=2.0281\n",
+      "Memory estimate for binary LM:\n",
+      "type      MB\n",
+      "probing 1637 assuming -p 1.5\n",
+      "probing 1931 assuming -r models -p 1.5\n",
+      "trie     833 without quantization\n",
+      "trie     476 assuming -q 8 -b 8 quantization \n",
+      "trie     726 assuming -a 22 array pointer compression\n",
+      "trie     368 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
+      "=== 3/5 Calculating and sorting initial probabilities ===\n",
+      "Chain sizes: 1:9316020 2:134929648 3:368420780 4:576699168 5:740130412\n",
+      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "####################################################################################################\n",
+      "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
+      "Chain sizes: 1:9316020 2:134929648 3:368420780 4:576699168 5:740130412\n",
+      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "####################################################################################################\n",
+      "=== 5/5 Writing ARPA model ===\n",
+      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "****************************************************************************************************\n",
+      "Name:lmplz\tVmPeak:201429316 kB\tVmRSS:29888 kB\tRSSMax:36259508 kB\tuser:86.1274\tsys:40.4955\tCPU:126.623\treal:99.6214\n"
+     ]
+    }
+   ],
+   "source": [
+    "!kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\\data\\\r\n",
+      "ngram 1=776335\r\n",
+      "ngram 2=8433103\r\n",
+      "ngram 3=18421039\r\n",
+      "ngram 4=24029132\r\n",
+      "ngram 5=26433229\r\n",
+      "\r\n",
+      "\\1-grams:\r\n",
+      "-6.9649706\t<unk>\t0\r\n",
+      "0\t<s>\t-0.10200334\r\n",
+      "-3.8677218\tशिवाय\t-0.29601222\r\n",
+      "-3.0139472\tत्यांना\t-0.54708624\r\n",
+      "-5.7931695\tकवितेचा\t-0.10200334\r\n",
+      "-2.2375891\tआणि\t-0.5685015\r\n",
+      "-6.046465\tचित्रकलेचा\t-0.16192785\r\n",
+      "-4.874536\tछंद\t-0.3758324\r\n",
+      "-3.150044\tहोता\t-0.53179973\r\n",
+      "-6.514799\tपारंपरिकदृष्ट्या\t-0.10200334\r\n",
+      "-4.837577\tज्वारी\t-0.3880814\r\n",
+      "-4.9689674\tबाजरी\t-0.32780117\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!head -20 5gram.arpa"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copy the ARPA file while adding a </s> unigram: the 1-gram count in the\n",
+    "# header is incremented and the <s> unigram line is duplicated with the\n",
+    "# token renamed to </s>. Both files are opened as UTF-8 explicitly.\n",
+    "with open(\"5gram.arpa\", \"r\", encoding=\"utf-8\") as read_file, open(\"5gram_correct.arpa\", \"w\", encoding=\"utf-8\") as write_file:\n",
+    "    has_added_eos = False\n",
+    "    for line in read_file:\n",
+    "        if not has_added_eos and \"ngram 1=\" in line:\n",
+    "            # Rewrite only the number after \"=\" so a count that also occurs\n",
+    "            # elsewhere in the line (e.g. \"ngram 1=1\") cannot alter the order field.\n",
+    "            header, _, count = line.rstrip().rpartition(\"=\")\n",
+    "            write_file.write(f\"{header}={int(count) + 1}\\n\")\n",
+    "        elif not has_added_eos and \"<s>\" in line:\n",
+    "            # First line mentioning <s>: keep it and append its </s> twin.\n",
+    "            write_file.write(line)\n",
+    "            write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
+    "            has_added_eos = True\n",
+    "        else:\n",
+    "            write_file.write(line)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\\data\\\r\n",
+      "ngram 1=776336\r\n",
+      "ngram 2=8433103\r\n",
+      "ngram 3=18421039\r\n",
+      "ngram 4=24029132\r\n",
+      "ngram 5=26433229\r\n",
+      "\r\n",
+      "\\1-grams:\r\n",
+      "-6.9649706\t<unk>\t0\r\n",
+      "0\t<s>\t-0.10200334\r\n",
+      "0\t</s>\t-0.10200334\r\n",
+      "-3.8677218\tशिवाय\t-0.29601222\r\n",
+      "-3.0139472\tत्यांना\t-0.54708624\r\n",
+      "-5.7931695\tकवितेचा\t-0.10200334\r\n",
+      "-2.2375891\tआणि\t-0.5685015\r\n",
+      "-6.046465\tचित्रकलेचा\t-0.16192785\r\n",
+      "-4.874536\tछंद\t-0.3758324\r\n",
+      "-3.150044\tहोता\t-0.53179973\r\n",
+      "-6.514799\tपारंपरिकदृष्ट्या\t-0.10200334\r\n",
+      "-4.837577\tज्वारी\t-0.3880814\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!head -20 5gram_correct.arpa"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoProcessor\n",
+    "\n",
+    "processor = AutoProcessor.from_pretrained(\"smangrul/xls-r-300m-mr\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'|': 0,\n",
+       " 'ँ': 1,\n",
+       " 'ं': 2,\n",
+       " 'ः': 3,\n",
+       " 'अ': 4,\n",
+       " 'आ': 5,\n",
+       " 'इ': 6,\n",
+       " 'ई': 7,\n",
+       " 'उ': 8,\n",
+       " 'ऊ': 9,\n",
+       " 'ऋ': 10,\n",
+       " 'ए': 11,\n",
+       " 'ऐ': 12,\n",
+       " 'ऑ': 13,\n",
+       " 'ओ': 14,\n",
+       " 'औ': 15,\n",
+       " 'क': 16,\n",
+       " 'ख': 17,\n",
+       " 'ग': 18,\n",
+       " 'घ': 19,\n",
+       " 'च': 20,\n",
+       " 'छ': 21,\n",
+       " 'ज': 22,\n",
+       " 'झ': 23,\n",
+       " 'ञ': 24,\n",
+       " 'ट': 25,\n",
+       " 'ठ': 26,\n",
+       " 'ड': 27,\n",
+       " 'ढ': 28,\n",
+       " 'ण': 29,\n",
+       " 'त': 30,\n",
+       " 'थ': 31,\n",
+       " 'द': 32,\n",
+       " 'ध': 33,\n",
+       " 'न': 34,\n",
+       " 'प': 35,\n",
+       " 'फ': 36,\n",
+       " 'ब': 37,\n",
+       " 'भ': 38,\n",
+       " 'म': 39,\n",
+       " 'य': 40,\n",
+       " 'र': 41,\n",
+       " 'ऱ': 42,\n",
+       " 'ल': 43,\n",
+       " 'ळ': 44,\n",
+       " 'व': 45,\n",
+       " 'श': 46,\n",
+       " 'ष': 47,\n",
+       " 'स': 48,\n",
+       " 'ह': 49,\n",
+       " '़': 50,\n",
+       " 'ा': 51,\n",
+       " 'ि': 52,\n",
+       " 'ी': 53,\n",
+       " 'ु': 54,\n",
+       " 'ू': 55,\n",
+       " 'ृ': 56,\n",
+       " 'ॄ': 57,\n",
+       " 'ॅ': 58,\n",
+       " 'े': 59,\n",
+       " 'ै': 60,\n",
+       " 'ॉ': 61,\n",
+       " 'ॊ': 62,\n",
+       " 'ो': 63,\n",
+       " 'ौ': 64,\n",
+       " '्': 65,\n",
+       " 'ॲ': 66,\n",
+       " '[unk]': 67,\n",
+       " '[pad]': 68,\n",
+       " '<s>': 69,\n",
+       " '</s>': 70}"
+      ]
+     },
+     "execution_count": 88,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "vocab_dict = processor.tokenizer.get_vocab()\n",
+    "# Lower-case every token and order the entries by ascending token id.\n",
+    "sorted_vocab_dict = dict(\n",
+    "    (token.lower(), token_id)\n",
+    "    for token, token_id in sorted(vocab_dict.items(), key=lambda pair: pair[1])\n",
+    ")\n",
+    "sorted_vocab_dict\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading the LM will be faster if you build a binary file.\n",
+      "Reading /ebs/learn/ASR/5gram_correct.arpa\n",
+      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "****************************************************************************************************\n",
+      "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
+      "Unigrams and labels don't seem to agree.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pyctcdecode import build_ctcdecoder\n",
+    "\n",
+    "# Labels are the vocabulary tokens in the order held by sorted_vocab_dict.\n",
+    "decoder = build_ctcdecoder(labels=[*sorted_vocab_dict], kenlm_model_path=\"5gram_correct.arpa\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pyctcdecode.decoder.BeamSearchDecoderCTC at 0x7fe8a63c65d0>"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "decoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import Wav2Vec2ProcessorWithLM\n",
+    "\n",
+    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
+    "    feature_extractor=processor.feature_extractor,\n",
+    "    tokenizer=processor.tokenizer,\n",
+    "    decoder=decoder\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "processor_with_lm.save_pretrained(\"./smangrul/xls-r-300m-mr/\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "processor_with_lm.save_pretrained(\"./../xls-r-300m-mr-model/\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "hf",
+   "language": "python",
+   "name": "hf"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}