File size: 13,521 Bytes

7abf99e

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "831245a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2ac8a30f",
   "metadata": {},
   "outputs": [],
   "source": [
    "target_lang=\"ga-IE\"  # change to your target lang"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "15710167",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration ga-pl-lang1=ga,lang2=pl\n",
      "Reusing dataset opus_dgt (/workspace/cache/hf/datasets/opus_dgt/ga-pl-lang1=ga,lang2=pl/0.0.0/a4db75cea3712eb5d4384f0539db82abf897c6b6da5e5e81693e8fd201efc346)\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# dataset = load_dataset(\"mozilla-foundation/common_voice_8_0\", \n",
    "#                        \"ga-IE\", \n",
    "#                        split=\"train\", \n",
    "#                        use_auth_token = True)\n",
    "\n",
    "dataset = load_dataset(\"opus_dgt\", lang1=\"ga\", lang2=\"pl\", split = 'train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "fb20d4de",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ga_txt = [i['ga'] for i in dataset['translation']]\n",
    "# ga_txt = pd.Series(ga_txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "eeca1851",
   "metadata": {},
   "outputs": [],
   "source": [
    "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–]'  # change to the ignored characters of your fine-tuned model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "4df93c9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def extract_text(batch):\n",
    "  text = batch[\"translation\"]\n",
    "  ga_text = text['ga']\n",
    "  batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", ga_text.lower())\n",
    "  return batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "84bedd13",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d9a11f167bb94faa8e9f6a511407acb4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0ex [00:00, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset = dataset.map(extract_text, remove_columns=dataset.column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "31cb3c6b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "342d92a5d9c44c59bcb5dca143ced3b6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset.push_to_hub(f\"{target_lang}_opus_dgt_train\", split=\"train\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70952673",
   "metadata": {},
   "source": [
    "## N-gram KenLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "id": "51756959",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "38d3c229117f4e60a7778f974ac609de",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/1.60k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration jcmc--ga-IE_opus_dgt_train-aa318da91f5f84f6\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset opus_dgt/ga-pl (download: 12.11 MiB, generated: 28.99 MiB, post-processed: Unknown size, total: 41.11 MiB) to /workspace/cache/hf/datasets/parquet/jcmc--ga-IE_opus_dgt_train-aa318da91f5f84f6/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e5e07f18549b443ead74991a9b338593",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e83c78fa1bc43f19a56b623c92a64a4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/12.7M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "06649f5cd3324eb49a1bd09b68aa23b6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset parquet downloaded and prepared to /workspace/cache/hf/datasets/parquet/jcmc--ga-IE_opus_dgt_train-aa318da91f5f84f6/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset = load_dataset(\"jcmc/ga-IE_opus_dgt_train\", split=\"train\")\n",
    "\n",
    "with open(\"text.txt\", \"w\") as file:\n",
    "  file.write(\" \".join(dataset[\"text\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "77eb3a41",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 1/5 Counting and sorting n-grams ===\n",
      "Reading /workspace/wav2vec-1b-cv8-ir/text.txt\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Unigram tokens 4378228 types 70781\n",
      "=== 2/5 Calculating and sorting adjusted counts ===\n",
      "Chain sizes: 1:849372 2:14475680768 3:27141902336 4:43427041280 5:63331106816\n",
      "Statistics:\n",
      "1 70780 D1=0.684187 D2=1.0538 D3+=1.37643\n",
      "2 652306 D1=0.766205 D2=1.12085 D3+=1.39031\n",
      "3 1669326 D1=0.84217 D2=1.20654 D3+=1.39941\n",
      "4 2514789 D1=0.896214 D2=1.29731 D3+=1.47431\n",
      "5 3053088 D1=0.794858 D2=1.47897 D3+=1.5117\n",
      "Memory estimate for binary LM:\n",
      "type     MB\n",
      "probing 164 assuming -p 1.5\n",
      "probing 192 assuming -r models -p 1.5\n",
      "trie     77 without quantization\n",
      "trie     42 assuming -q 8 -b 8 quantization \n",
      "trie     69 assuming -a 22 array pointer compression\n",
      "trie     34 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
      "=== 3/5 Calculating and sorting initial probabilities ===\n",
      "Chain sizes: 1:849360 2:10436896 3:33386520 4:60354936 5:85486464\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
      "Chain sizes: 1:849360 2:10436896 3:33386520 4:60354936 5:85486464\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "####################################################################################################\n",
      "=== 5/5 Writing ARPA model ===\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n",
      "Name:lmplz\tVmPeak:145097728 kB\tVmRSS:51788 kB\tRSSMax:25679020 kB\tuser:9.15304\tsys:14.1178\tCPU:23.2708\treal:20.9339\n"
     ]
    }
   ],
   "source": [
    "!../kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "id": "0e043b87",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n",
    "  has_added_eos = False\n",
    "  for line in read_file:\n",
    "    if not has_added_eos and \"ngram 1=\" in line:\n",
    "      count=line.strip().split(\"=\")[-1]\n",
    "      write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
    "    elif not has_added_eos and \"<s>\" in line:\n",
    "      write_file.write(line)\n",
    "      write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
    "      has_added_eos = True\n",
    "    else:\n",
    "      write_file.write(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "id": "d106c7d1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\data\\\n",
      "ngram 1=70781\n",
      "ngram 2=652306\n",
      "ngram 3=1669326\n",
      "ngram 4=2514789\n",
      "ngram 5=3053088\n",
      "\n",
      "\\1-grams:\n",
      "-5.8501472\t<unk>\t0\n",
      "0\t<s>\t-0.11565505\n",
      "0\t</s>\t-0.11565505\n",
      "-5.4088216\tmiontuairisc\t-0.20133564\n",
      "-4.6517477\tcheartaitheach\t-0.24842946\n",
      "-2.1893916\tmaidir\t-1.7147961\n",
      "-2.1071756\tle\t-0.7007309\n",
      "-4.156014\tcoinbhinsiún\t-0.31064242\n",
      "-1.8876181\tar\t-0.9045828\n",
      "-4.62287\tdhlínse\t-0.24268326\n",
      "-1.6051095\tagus\t-0.8729715\n",
      "-4.1465816\taithint\t-0.21693327\n"
     ]
    }
   ],
   "source": [
    "!head -20 5gram_correct.arpa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "85ef4c43",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoProcessor\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(\"./\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "id": "cb2a2768",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_dict = processor.tokenizer.get_vocab()\n",
    "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "d19eee6f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
      "Unigrams and labels don't seem to agree.\n"
     ]
    }
   ],
   "source": [
    "from pyctcdecode import build_ctcdecoder\n",
    "\n",
    "decoder = build_ctcdecoder(\n",
    "    labels=list(sorted_vocab_dict.keys()),\n",
    "    kenlm_model_path=\"5gram_correct.arpa\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "4e8031a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Wav2Vec2ProcessorWithLM\n",
    "\n",
    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
    "    feature_extractor=processor.feature_extractor,\n",
    "    tokenizer=processor.tokenizer,\n",
    "    decoder=decoder\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "6f32faf4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/workspace/wav2vec-1b-cv8-ir/./ is already a clone of https://huggingface.co/jcmc/wav2vec-1b-cv8-ir. Make sure you pull the latest changes with `repo.git_pull()`.\n"
     ]
    }
   ],
   "source": [
    "from huggingface_hub import Repository\n",
    "\n",
    "repo = Repository(local_dir=\"./\", clone_from=\"jcmc/wav2vec-1b-cv8-ir\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "id": "a7e91068",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/workspace/wav2vec-1b-cv8-ir'"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a1de336",
   "metadata": {},
   "outputs": [],
   "source": [
    "processor_with_lm.save_pretrained(\"xls-r-300m-sv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}