{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0f0b980b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import transformers\n",
    "from datasets import load_dataset\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "55293a8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "username = \"Plim\"  # change to your username\n",
    "target_lang = \"fr\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3b5a735e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f230feb459c441a9a11e53b867e8914a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/2.60k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a4a8fa35d48f4a6db8072baed6b2389b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/29.6k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration en-fr-lang1=en,lang2=fr\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset europarl_bilingual/en-fr (download: 278.07 MiB, generated: 643.66 MiB, post-processed: Unknown size, total: 921.72 MiB) to /workspace/.cache/huggingface/datasets/europarl_bilingual/en-fr-lang1=en,lang2=fr/8.0.0/2ab0200e7729616bfd4a4df6bfb29b31746ceb5a59f8c75c02ca35e1ebead950...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "aa00b5d6dc154449861dddcf9f0d2fc8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/142M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "563096fc78454333b5ae23e87a7e3469",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/140M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eba05e5151b34505b9a43e383cb6cfe0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/9.30M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset europarl_bilingual downloaded and prepared to /workspace/.cache/huggingface/datasets/europarl_bilingual/en-fr-lang1=en,lang2=fr/8.0.0/2ab0200e7729616bfd4a4df6bfb29b31746ceb5a59f8c75c02ca35e1ebead950. Subsequent calls will reuse this data.\n"
     ]
    }
   ],
   "source": [
    "dataset = load_dataset(\"europarl_bilingual\", lang1=\"en\", lang2=target_lang, split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "153d3a5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_text(batch):\n",
    "    target_lang = \"fr\"\n",
    "    chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\\'’ ]'\n",
    "    text = batch[\"translation\"][target_lang]\n",
    "    batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", text.lower()).replace('’', \"'\")\n",
    "    return batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1f83c4d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "00d998de52544f6c8750c53bc0c85d66",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0ex [00:00, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset = dataset.map(extract_text, remove_columns=dataset.column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "5c14f88e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "461a219cdb6d42b2b890ec028c336e7f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset.push_to_hub(f\"{target_lang}_corpora_parliament_processed\", split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b2d7665a",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"text.txt\", \"w\") as file:\n",
    "    file.write(\" \".join(dataset[\"text\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "959cb6fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"language_model/5gram.arpa\", \"r\") as read_file, open(\"language_model/5gram_correct.arpa\", \"w\") as write_file:\n",
    "    has_added_eos = False\n",
    "    for line in read_file:\n",
    "        if not has_added_eos and \"ngram 1=\" in line:\n",
    "            count=line.strip().split(\"=\")[-1]\n",
    "            write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
    "        elif not has_added_eos and \"<s>\" in line:\n",
    "            write_file.write(line)\n",
    "            write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
    "            has_added_eos = True\n",
    "        else:\n",
    "            write_file.write(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3fdae043",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoProcessor\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(\"./\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2c3c2b49",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_dict = processor.tokenizer.get_vocab()\n",
    "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "103034b6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the LM will be faster if you build a binary file.\n",
      "Reading /workspace/xls-r-1b-cv_8-fr/language_model/5gram_correct.arpa\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n"
     ]
    }
   ],
   "source": [
    "from pyctcdecode import build_ctcdecoder\n",
    "\n",
    "decoder = build_ctcdecoder(\n",
    "    labels=list(sorted_vocab_dict.keys()),\n",
    "    kenlm_model_path=\"language_model/5gram_correct.arpa\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "40dd5ada",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Wav2Vec2ProcessorWithLM\n",
    "\n",
    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
    "    feature_extractor=processor.feature_extractor,\n",
    "    tokenizer=processor.tokenizer,\n",
    "    decoder=decoder\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "754cd832",
   "metadata": {},
   "outputs": [],
   "source": [
    "processor_with_lm.save_pretrained(\"Plim/xls-r-1b-cv_8-fr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "577d9617",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}