File size: 6,011 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "04c8de09",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "1eae750a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fr/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n"
     ]
    }
   ],
   "source": [
    "dataset = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"fr\", split=\"train\", use_auth_token=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "da1cfcaa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c110c54654c045b9a2cbc6cad43fa685",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0ex [00:00, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\\'’ ]'\n",
    "\n",
    "def extract_text(batch):\n",
    "    batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", batch[\"sentence\"].lower()).replace('’', \"'\")\n",
    "    return batch\n",
    "\n",
    "dataset = dataset.map(extract_text, remove_columns=[\"sentence\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "bb306916",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d21bc14560b747f49105f598a2ffe2ff",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/29 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset.push_to_hub(f\"common_voice_7_0_fr_processed\", split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "312d9c63",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"text_for_lm.txt\", \"w\") as file:\n",
    "    file.write(\" \".join(dataset[\"text\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "6d82daed",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n",
    "    has_added_eos = False\n",
    "    for line in read_file:\n",
    "        if not has_added_eos and \"ngram 1=\" in line:\n",
    "            count=line.strip().split(\"=\")[-1]\n",
    "            write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
    "        elif not has_added_eos and \"<s>\" in line:\n",
    "            write_file.write(line)\n",
    "            write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
    "            has_added_eos = True\n",
    "        else:\n",
    "            write_file.write(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "07ff4067",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoProcessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e75ab227",
   "metadata": {},
   "outputs": [],
   "source": [
    "processor = AutoProcessor.from_pretrained(\"./\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "604776b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_dict = processor.tokenizer.get_vocab()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ef4dd957",
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9a14839d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the LM will be faster if you build a binary file.\n",
      "Reading /home/pascal/kenlm/build/bin/xls-r-300m-lm-fr/language_model/5gram_correct.arpa\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n"
     ]
    }
   ],
   "source": [
    "from pyctcdecode import build_ctcdecoder\n",
    "\n",
    "decoder = build_ctcdecoder(\n",
    "    labels=list(sorted_vocab_dict.keys()),\n",
    "    kenlm_model_path=\"./language_model/5gram_correct.arpa\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "656979ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Wav2Vec2ProcessorWithLM\n",
    "\n",
    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
    "    feature_extractor=processor.feature_extractor,\n",
    "    tokenizer=processor.tokenizer,\n",
    "    decoder=decoder\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d2dd8891",
   "metadata": {},
   "outputs": [],
   "source": [
    "processor_with_lm.save_pretrained(\"xls-r-300m-lm-fr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85908c6d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}