{ "cells": [ { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset, concatenate_datasets\n" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset common_voice (/home/ubuntu/.cache/huggingface/datasets/mozilla-foundation___common_voice/mr/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset({\n", " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n", " num_rows: 698\n", "})\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset open_slr (/home/ubuntu/.cache/huggingface/datasets/open_slr/SLR64/0.0.0/e0fb9e36094eff565efe812d1aba158f6a46ce834cb9705c91d1e2d6ba78ed31)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset({\n", " features: ['path', 'audio', 'sentence'],\n", " num_rows: 1569\n", "})\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration shivam--marathi_samanantar_processed-538aa7995793bd87\n", "Reusing dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/shivam--marathi_samanantar_processed-538aa7995793bd87/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset({\n", " features: ['text'],\n", " num_rows: 3047226\n", "})\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration shivam--marathi_pib_processed-2348554e5319bdfe\n", "Reusing dataset parquet (/home/ubuntu/.cache/huggingface/datasets/parquet/shivam--marathi_pib_processed-2348554e5319bdfe/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset({\n", " features: 
['text'],\n", " num_rows: 117199\n", "})\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset opus100 (/home/ubuntu/.cache/huggingface/datasets/opus100/en-mr/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)\n", "Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/opus100/en-mr/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704/cache-201d21d7acc2864f.arrow\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset({\n", " features: ['translation', 'sentence'],\n", " num_rows: 27007\n", "})\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset tatoeba (/home/ubuntu/.cache/huggingface/datasets/tatoeba/en-mr/2021.7.22/b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c0dba507cea344768aa20cd7c5593a0c", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/53462 [00:00 \"5gram.arpa\"" ] },
{ "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\\data\\\r\n", "ngram 1=776335\r\n", "ngram 2=8433103\r\n", "ngram 3=18421039\r\n", "ngram 4=24029132\r\n", "ngram 5=26433229\r\n", "\r\n", "\\1-grams:\r\n", "-6.9649706\t\t0\r\n", "0\t\t-0.10200334\r\n", "-3.8677218\tशिवाय\t-0.29601222\r\n", "-3.0139472\tत्यांना\t-0.54708624\r\n", "-5.7931695\tकवितेचा\t-0.10200334\r\n", "-2.2375891\tआणि\t-0.5685015\r\n", "-6.046465\tचित्रकलेचा\t-0.16192785\r\n", "-4.874536\tछंद\t-0.3758324\r\n", "-3.150044\tहोता\t-0.53179973\r\n", "-6.514799\tपारंपरिकदृष्ट्या\t-0.10200334\r\n", "-4.837577\tज्वारी\t-0.3880814\r\n", "-4.9689674\tबाजरी\t-0.32780117\r\n" ] } ], "source": [ "!head -20 5gram.arpa" ] },
{ "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [
"# KenLM post-processing: copy 5gram.arpa to 5gram_correct.arpa while inserting an\n",
"# end-of-sentence unigram. The first line containing \"<s>\" is written twice, the\n",
"# second time with \"<s>\" replaced by \"</s>\", and the unigram count declared on the\n",
"# \"ngram 1=\" header line is raised by one to match.\n",
"# utf-8 is requested explicitly: the file holds Devanagari text and the platform\n",
"# default encoding is not guaranteed to decode it.\n",
"with open(\"5gram.arpa\", \"r\", encoding=\"utf-8\") as read_file, open(\"5gram_correct.arpa\", \"w\", encoding=\"utf-8\") as write_file:\n",
"    has_added_eos = False\n",
"    for line in read_file:\n",
"        if not has_added_eos and \"ngram 1=\" in line:\n",
"            # Rebuild the header instead of str.replace()-ing the digits, which would\n",
"            # also rewrite the order in \"ngram 1\" when the count itself is \"1\".\n",
"            prefix, _, count = line.strip().rpartition(\"=\")\n",
"            write_file.write(f\"{prefix}={int(count) + 1}\\n\")\n",
"        elif not has_added_eos and \"<s>\" in line:\n",
"            write_file.write(line)\n",
"            write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
"            has_added_eos = True\n",
"        else:\n",
"            write_file.write(line)\n",
"\n",
"# Fail loudly instead of leaving an ARPA file that silently lacks \"</s>\".\n",
"if not has_added_eos:\n",
"    raise ValueError(\"no '<s>' entry found in 5gram.arpa; '</s>' was not added\")"
] },
{ "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\\data\\\r\n", "ngram 1=776336\r\n", "ngram 2=8433103\r\n", "ngram 3=18421039\r\n", "ngram 4=24029132\r\n", "ngram 5=26433229\r\n", "\r\n", "\\1-grams:\r\n", "-6.9649706\t\t0\r\n", "0\t\t-0.10200334\r\n", "0\t\t-0.10200334\r\n", "-3.8677218\tशिवाय\t-0.29601222\r\n", "-3.0139472\tत्यांना\t-0.54708624\r\n", "-5.7931695\tकवितेचा\t-0.10200334\r\n", "-2.2375891\tआणि\t-0.5685015\r\n", "-6.046465\tचित्रकलेचा\t-0.16192785\r\n", "-4.874536\tछंद\t-0.3758324\r\n", "-3.150044\tहोता\t-0.53179973\r\n", "-6.514799\tपारंपरिकदृष्ट्या\t-0.10200334\r\n", "-4.837577\tज्वारी\t-0.3880814\r\n" ] } ], "source": [ "!head -20 5gram_correct.arpa" ] },
{ "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoProcessor\n", "\n", "processor = AutoProcessor.from_pretrained(\"smangrul/xls-r-300m-mr\")" ] },
{ "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'|': 0,\n", " 'ँ': 1,\n", " 'ं': 2,\n", " 'ः': 3,\n", " 'अ': 4,\n", " 'आ': 5,\n", " 'इ': 6,\n", " 'ई': 7,\n", " 'उ': 8,\n", " 'ऊ': 9,\n", " 'ऋ': 10,\n", " 'ए': 11,\n", " 'ऐ': 12,\n", " 'ऑ': 13,\n", " 'ओ': 14,\n", " 'औ': 15,\n", " 'क': 16,\n", " 'ख': 17,\n", " 'ग': 18,\n", " 'घ': 19,\n", " 'च': 20,\n", " 'छ': 21,\n", " 'ज': 22,\n", " 'झ': 23,\n", " 'ञ': 24,\n", " 'ट': 25,\n", " 'ठ': 26,\n", " 'ड': 27,\n", " 'ढ': 28,\n", " 'ण': 29,\n", " 'त': 30,\n", " 'थ': 31,\n", " 'द': 32,\n", " 'ध': 
33,\n", " 'न': 34,\n", " 'प': 35,\n", " 'फ': 36,\n", " 'ब': 37,\n", " 'भ': 38,\n", " 'म': 39,\n", " 'य': 40,\n", " 'र': 41,\n", " 'ऱ': 42,\n", " 'ल': 43,\n", " 'ळ': 44,\n", " 'व': 45,\n", " 'श': 46,\n", " 'ष': 47,\n", " 'स': 48,\n", " 'ह': 49,\n", " '़': 50,\n", " 'ा': 51,\n", " 'ि': 52,\n", " 'ी': 53,\n", " 'ु': 54,\n", " 'ू': 55,\n", " 'ृ': 56,\n", " 'ॄ': 57,\n", " 'ॅ': 58,\n", " 'े': 59,\n", " 'ै': 60,\n", " 'ॉ': 61,\n", " 'ॊ': 62,\n", " 'ो': 63,\n", " 'ौ': 64,\n", " '्': 65,\n", " 'ॲ': 66,\n", " '[unk]': 67,\n", " '[pad]': 68,\n", " '': 69,\n", " '': 70}" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [
"vocab_dict = processor.tokenizer.get_vocab()\n",
"# Re-key the tokenizer vocabulary with lower-cased tokens, inserting entries in\n",
"# ascending token-id order so that the dict's key order follows the ids.\n",
"sorted_vocab_dict = {}\n",
"for token, token_id in sorted(vocab_dict.items(), key=lambda pair: pair[1]):\n",
"    sorted_vocab_dict[token.lower()] = token_id\n",
"sorted_vocab_dict\n"
] },
{ "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading the LM will be faster if you build a binary file.\n", "Reading /ebs/learn/ASR/5gram_correct.arpa\n", "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", "****************************************************************************************************\n", "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. 
Is this correct?\n", "Unigrams and labels don't seem to agree.\n" ] } ], "source": [
"from pyctcdecode import build_ctcdecoder\n",
"\n",
"# Labels are handed over in the key order of sorted_vocab_dict.\n",
"ctc_labels = list(sorted_vocab_dict)\n",
"decoder = build_ctcdecoder(labels=ctc_labels, kenlm_model_path=\"5gram_correct.arpa\")"
] },
{ "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "decoder" ] },
{ "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [
"from transformers import Wav2Vec2ProcessorWithLM\n",
"\n",
"# Combine the loaded processor's feature extractor and tokenizer with the\n",
"# language-model decoder built above into a single processor object.\n",
"processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
"    decoder=decoder,\n",
"    tokenizer=processor.tokenizer,\n",
"    feature_extractor=processor.feature_extractor,\n",
")"
] },
{ "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [], "source": [
"# Write the LM-augmented processor under ./smangrul/xls-r-300m-mr/.\n",
"processor_with_lm.save_pretrained(\"./smangrul/xls-r-300m-mr/\")"
] },
{ "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [
"# Write a second copy into the sibling ../xls-r-300m-mr-model/ directory.\n",
"processor_with_lm.save_pretrained(\"./../xls-r-300m-mr-model/\")"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "hf", "language": "python", "name": "hf" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }