Add required scripts

Browse files

Files changed (11) hide show

.gitattributes +1 -0
README.md +0 -0
notes/.keep +0 -0
notes/data_preparation.ipynb +626 -0
notes/fa.tar.gz +3 -0
src/dictionary.py +664 -0
src/normalizer.py +227 -0
src/requirements.txt +3 -0
src/run_config.py +108 -0
src/run_persian.sh +51 -0
src/run_wav2vec2_pretrain_flax.py +638 -0

.gitattributes CHANGED Viewed

@@ -14,3 +14,4 @@
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text

 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

File without changes

notes/.keep ADDED Viewed

File without changes

notes/data_preparation.ipynb ADDED Viewed

	@@ -0,0 +1,626 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "['../src',\n '/Users/m3hrdadfi/Projects/HF/hfflax/hub/wav2vec2-base-persian/notes',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles/lib/python',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python39.zip',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/lib-dynload',\n '',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages',\n '/Users/m3hrdadfi/Projects/Apps/zabanshenas',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages/IPython/extensions',\n '/Users/m3hrdadfi/.ipython']"
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sys.path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if \"../src\" not in sys.path:\n",
+    "    sys.path.insert(0, \"../src\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from normalizer import normalizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "سلام بر شما که می‌آیید و می‌آموزید که بی‌آرآیم \n",
+      "کتاب‌هایمان میدانی کجا‌ها ماه‌هاس که کی‌هامون و کیهان دنباله‌هاشون برای بهای هستند \n",
+      "میان‌‌افزار‌های امروزی نرم‌افزار سخت‌افزار امروز نوشت‌افزار‌ها \n",
+      "این کتاب بهترین در نوع شتر آسان‌تر هست \n",
+      "سه چیز هست که از پژوهش در این زمینه آموخته‌ام \n"
+     ]
+    }
+   ],
+   "source": [
+    "input_text = \"سلام بر شما که میآیید و میآموزید که بیآرآیم\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+    "\n",
+    "input_text = \"کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند.\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+    "\n",
+    "input_text = \" میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+    "\n",
+    "input_text = \"این کتاب بهترین در نوع شتر آسانتر هست\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+    "\n",
+    "input_text = \"سه چیز هست که از پژوهش در این زمینه آموختهام\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !mkdir -p /home/m3hrdadfi/code/data\n",
+    "# %cd /home/m3hrdadfi/code/data\n",
+    "# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n",
+    "# %cd /home/m3hrdadfi/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import os\n",
+    "\n",
+    "# lang = \"fa\"\n",
+    "# abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/code/data/{lang}/dataset\", f\"cv{lang}\", lang)\n",
+    "# save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n",
+    "# print(abs_path_to_data)\n",
+    "# print(save_path)\n",
+    "# print()\n",
+    "# !ls {save_path}\n",
+    "# !ls {abs_path_to_data}/*.tsv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def normalizer_without_batch(text, pruning=False):\n",
+    "    try:\n",
+    "        batch = {\n",
+    "            \"sentence\": text\n",
+    "        }\n",
+    "        text = normalizer(batch, return_dict=False)\n",
+    "        \n",
+    "        if pruning:\n",
+    "            if not len(text.split()) > 3:\n",
+    "                text = None\n",
+    "        \n",
+    "    except:\n",
+    "        print(text)\n",
+    "        text = None\n",
+    "        \n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n",
+    "\n",
+    "# print(f\"Step 0: {len(test_df)}\")\n",
+    "\n",
+    "# test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n",
+    "# test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
+    "# test_df = test_df.dropna(subset=[\"status\"])\n",
+    "# test_df = test_df.drop(columns=\"status\")\n",
+    "# print(f\"Step 1: {len(test_df)}\")\n",
+    "\n",
+    "# test_df[\"prev_sentence\"] = test_df[\"sentence\"]\n",
+    "# test_df[\"sentence\"] = test_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
+    "# test_df = test_df.dropna(subset=[\"sentence\"])\n",
+    "# print(f\"Step 2: {len(test_df)}\")\n",
+    "\n",
+    "# test_df = test_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
+    "# test_df = test_df.drop_duplicates(subset=\"path\")\n",
+    "# print(f\"Step 3: {len(test_df)}\")\n",
+    "\n",
+    "# test_df = test_df.reset_index(drop=True)\n",
+    "# test_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# _train_df = pd.concat([\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
+    "# ])\n",
+    "# print(len(_train_df))\n",
+    "\n",
+    "# train_df = pd.concat([\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n",
+    "# ])\n",
+    "# print(f\"Step 0: {len(train_df)}\")\n",
+    "\n",
+    "# train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n",
+    "# train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
+    "# train_df = train_df.dropna(subset=[\"status\"])\n",
+    "# train_df = train_df.drop(columns=\"status\")\n",
+    "# print(f\"Step 1: {len(train_df)}\")\n",
+    "\n",
+    "# train_df[\"prev_sentence\"] = train_df[\"sentence\"]\n",
+    "# train_df[\"sentence\"] = train_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t, pruning=True))\n",
+    "# train_df = train_df.dropna(subset=[\"sentence\"])\n",
+    "# print(f\"Step 2: {len(train_df)}\")\n",
+    "\n",
+    "# train_df = train_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
+    "# train_df = train_df.drop_duplicates(subset=\"path\")\n",
+    "# print(f\"Step 3: {len(train_df)}\")\n",
+    "\n",
+    "# train_df = train_df.sample(frac=1)\n",
+    "# train_df = train_df.reset_index(drop=True)\n",
+    "# train_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from tqdm import tqdm\n",
+    "\n",
+    "# testset_indices = []\n",
+    "\n",
+    "# for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n",
+    "#     _id = row[\"path\"]\n",
+    "#     finder = train_df[train_df[\"path\"] == _id]\n",
+    "#     if len(finder) > 0:\n",
+    "#         testset_indices.extend(list(finder.index))\n",
+    "\n",
+    "# testset_indices = list(set(testset_indices))\n",
+    "# print(f\"Found #{len(testset_indices)} test data\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print(len(train_df))\n",
+    "# train_df = train_df.drop(testset_indices)\n",
+    "# print(len(train_df))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import pandas as pd\n",
+    "\n",
+    "# df = pd.concat([train_df, test_df], axis=0)\n",
+    "# # df = validated_df.copy()\n",
+    "# print(df.info())\n",
+    "# # df[\"sentence\"] = df[\"prev_sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
+    "# # df = df.dropna(subset=[\"sentence\"])\n",
+    "# # df[\"sentence_spell\"] = df[\"sentence\"].apply(lambda t: normalizer({\"sentence\": t}, is_spell_check=True, return_dict=False))\n",
+    "# df = df.reset_index(drop=True)\n",
+    "# print(df.info())\n",
+    "# df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import torchaudio\n",
+    "# import librosa\n",
+    "# import IPython.display as ipd\n",
+    "# import numpy as np\n",
+    "\n",
+    "# def load_audio(path):\n",
+    "#     speech, sr = torchaudio.load(path)\n",
+    "#     speech = speech[0].numpy().squeeze()    \n",
+    "#     speech = librosa.resample(np.asarray(speech), sr, 16_000)\n",
+    "    \n",
+    "#     print(speech.shape, sr)\n",
+    "    \n",
+    "#     ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# main_vocab = [\"ح\", \"چ\", \"ج\", \"ث\", \"ت\", \"پ\", \"ب\", \"آ\", \"ا\", \"ش\", \"س\", \"ژ\", \"ز\", \"ر\", \"ذ\", \"د\", \"خ\", \"ق\", \"ف\", \"غ\", \"ع\", \"ظ\", \"ط\", \"ض\", \"ص\", \"ی\", \"ه\", \"و\", \"ن\", \"م\", \"ل\", \"گ\", \"ک\"]\n",
+    "# text = \" \".join(df[\"sentence\"].values.tolist())\n",
+    "# vocab = list(sorted(set(text)))\n",
+    "\n",
+    "# for v in main_vocab:\n",
+    "#     if v not in vocab:\n",
+    "#         print(\"v\", v)\n",
+    "\n",
+    "# print(len(main_vocab), len(vocab))\n",
+    "# print(len(vocab), vocab)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import numpy as np\n",
+    "\n",
+    "\n",
+    "# idx = np.random.randint(0, len(df))\n",
+    "# # idx = 6140\n",
+    "# sample = df.iloc[idx]\n",
+    "# ipd.display(sample)\n",
+    "# # print(sample.iloc[idx][\"prev_sentence\"])\n",
+    "# print()\n",
+    "# print(sample[\"prev_sentence\"])\n",
+    "# print(sample[\"sentence\"])\n",
+    "# print()\n",
+    "# load_audio(sample[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_train_df = train_df.copy()\n",
+    "# new_train_df[\"_path\"] = new_train_df[\"path\"]\n",
+    "# new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
+    "# print(new_train_df.info())\n",
+    "# new_train_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_test_df = test_df.copy()\n",
+    "# new_test_df[\"_path\"] = new_test_df[\"path\"]\n",
+    "# new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
+    "# print(new_test_df.info())\n",
+    "# new_test_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import shutil\n",
+    "# from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !mkdir -p {save_path}/clips\n",
+    "# !mkdir -p {save_path}/augs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n",
+    "#     shutil.copy(row[\"_path\"], row[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n",
+    "#     shutil.copy(row[\"_path\"], row[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # aug_train_df = new_train_df.copy()\n",
+    "# aug_train_df = new_train_df.sample(frac=0.1)\n",
+    "# aug_train_df = aug_train_df.reset_index(drop=True)\n",
+    "# aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n",
+    "# aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n",
+    "# print(aug_train_df.info())\n",
+    "# aug_train_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print(aug_train_df.iloc[0][\"_path\"])\n",
+    "# print(aug_train_df.iloc[0][\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # augmentation\n",
+    "\n",
+    "# from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n",
+    "# import numpy as np\n",
+    "# import soundfile as sf\n",
+    "\n",
+    "# augment = Compose([\n",
+    "# #     AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
+    "# #     PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n",
+    "# #     Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n",
+    "#     AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
+    "#     TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n",
+    "#     PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n",
+    "# ])\n",
+    "\n",
+    "# def augmented_speech_file_to_array_fn(in_path, out_path):\n",
+    "#     speech_array, sampling_rate = torchaudio.load(in_path)\n",
+    "#     speech_array = speech_array.squeeze().numpy()\n",
+    "#     speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n",
+    "#     sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n",
+    "# #     augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n",
+    "# !ls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n",
+    "# new_train_aug_df = new_train_df.copy()\n",
+    "# new_train_aug_df = new_train_aug_df.sample(frac=1)\n",
+    "# new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n",
+    "# print(new_train_aug_df.info())\n",
+    "# new_train_aug_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
+    "# new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
+    "# new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_train_df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_test_df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import pandas as pd\n",
+    "\n",
+    "# import os\n",
+    "# from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n",
+    "# print(train_df.info())\n",
+    "# train_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n",
+    "# print(test_df.info())\n",
+    "# test_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# non_existed_train = []\n",
+    "\n",
+    "# for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n",
+    "#     if not os.path.exists(row[\"path\"]):\n",
+    "#         non_existed_train.append(index)\n",
+    "#         break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import numpy as np\n",
+    "\n",
+    "\n",
+    "# idx = np.random.randint(0, len(train_df))\n",
+    "# # idx = 6140\n",
+    "# sample = train_df.iloc[idx]\n",
+    "# ipd.display(sample)\n",
+    "# # print(sample.iloc[idx][\"prev_sentence\"])\n",
+    "# print()\n",
+    "# print(sample[\"prev_sentence\"])\n",
+    "# print(sample[\"sentence\"])\n",
+    "# print()\n",
+    "# load_audio(sample[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_df_half = train_df.copy()\n",
+    "# print(train_df_half.shape)\n",
+    "# train_df_half = train_df_half.dropna()\n",
+    "# print(train_df_half.shape)\n",
+    "# train_df_half = train_df_half.drop_duplicates()\n",
+    "# print(train_df_half.shape)\n",
+    "\n",
+    "# train_df_half = train_df_half.sample(frac=0.5)\n",
+    "# train_df_half = train_df_half.reset_index(drop=True)\n",
+    "# print(train_df_half.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_df_half.to_csv(f\"{save_path}/train_no_aug_half.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "transformers",
+   "name": "transformers"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.4"
+  },
+  "orig_nbformat": 2
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

notes/fa.tar.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f3c53202d7d12dfe973604737fc11b0a50c9c94b85c4cae70fcc693fe2babb4
+size 7020110

src/dictionary.py ADDED Viewed

	@@ -0,0 +1,664 @@

+dictionary_mapping = {
+    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
+    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
+    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
+    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
+    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
+    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
+    "a": "‌ای‌", "b": "‌بی‌", "c": "‌سی‌", "d": "‌دی‌", "e": "‌ایی‌", "f": "‌اف‌",
+    "g": "‌جی‌", "h": "‌اچ‌", "i": "‌آی‌", "j": "‌جی‌", "k": "‌کی‌", "l": "‌ال‌",
+    "m": "‌ام‌", "n": "‌ان‌", "o": "‌او‌", "p": "‌پی‌", "q": "‌کیو‌", "r": "‌آر‌",
+    "s": "‌اس‌", "t": "‌تی‌", "u": "‌یو‌", "v": "‌وی‌", "w": "‌دبلیو‌", "x": "‌اکس‌",
+    "y": "‌وای‌", "z": "‌زد ",
+    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
+    "نو آوری‌مان": "نو‌آوری‌مان",
+    "نو آوری مان": "نو‌آوری‌مان",
+    "نو آوریمان": "نو‌آوری‌مان",
+    " ا م ": "‌ام ",
+    " م ": "‌ام ",
+    "کنندهای": "کننده‌ای",
+    "ارائهای": "ارائه‌ای",
+    "ایدهای": "ایده‌ای",
+    "ماسهای": "ماسه‌ای",
+    "خامنهای": "خامنه‌ای",
+    "قلهای": "قله‌ای",
+    "سیارهای": "سیاره‌ای",
+    "کیسهای": "کیسه‌ای",
+    "شانهای": "شانه‌ای",
+    "غریبهای": "غریبه‌ای",
+    "برنامهای": "برنامه‌ای",
+    "سختگیرانهای": "سختگیرانه‌ای",
+    "بهانهای": "بهانه‌ای",
+    "زیرروالهای": "زیر روالهای",
+    "درهای": "دره‌ای",
+    "آمادهای": "آماده‌ای",
+    "سادهای": "ساده‌ای",
+    "سرمایهگذارهای": "سرمایه گذارهای",
+    "فوقالعادهای": "فوق‌العاده‌ای",
+    "حادثهای": "حادثه‌ای",
+    "نویسندههای": "نویسنده‌های",
+    "علاقهای": "علاقه‌ای",
+    "برجستهای": "برجسته‌ای",
+    "جلگهای": "جلگه‌ای",
+    "زندهای": "زنده‌ای",
+    "فنآوریهای": "فناوری‌های",
+    "سایهروشنهای": "سایه روشن‌های",
+    "بیسابقهای": "بی سابقه‌ای",
+    "فرضیهای": "فرضیه‌ای",
+    "راهاندازهای": "راه اندازهای",
+    "بیشهای": "بیشه‌ای",
+    "مقالهای": "مقاله‌ای",
+    "دیگهای": "دیگه‌ای",
+    "ماههاست": "ماه هاست",
+    "نرمافزارهای": "نرم‌افزارهای",
+    "کتابسوزانهای": "کتاب سوزان‌های",
+    "سیستمعاملهای": "سیستم عامل‌های",
+    "اسلحهای": "اسلحه‌ای",
+    "وقفهای": "وقفه‌ای",
+    "زمینهای": "زمینه‌ای",
+    "حرامزادههای": "حرامزاده‌های",
+    "هزینهای": "هزینه‌ای",
+    "انداختهای": "انداخته‌ای",
+    "جسورانهای": "جسورانه‌ای",
+    "فاجعهای": "فاجعه‌ای",
+    "جامعهای": "جامعه‌ای",
+    "پدیدهای": "پدیده‌ای",
+    "اغواگرانهای": "اغواگرانه‌ای",
+    "تکانهای": "تکانه‌ای",
+    "لولهای": "لوله‌ای",
+    "نشانهای": "نشانه‌ای",
+    "وسیلهای": "وسیله‌ای",
+    "آیندهای": "آینده‌ای",
+    "بردهای": "برده‌ای",
+    "سابقهای": "سابقه‌ای",
+    "ناحیهای": "ناحیه‌ای",
+    "تکاندهندهای": "تکان دهنده‌ای",
+    "بودجهای": "بودجه‌ای",
+    "روزانهای": "روزانه‌ای",
+    "چارهای": "چاره‌ای",
+    "انگیزهای": "انگیزه‌ای",
+    "دادهای": "داده‌ای",
+    "عدهای": "عده‌ای",
+    "هفتهای": "هفته‌ای",
+    "منطقهای": "منطقه‌ای",
+    "استارتآپهای": "استارتاپ‌های",
+    "سازهای": "سازه‌ای",
+    "مجموعهای": "مجموعه‌ای",
+    "فلسفهای": "فلسفه‌ای",
+    "تذکردهندهای": "تذکر دهنده‌ای",
+    "مصاحبهای": "مصابحه‌ای",
+    "نمونهای": "نمونه‌ای",
+    "قلمموهای": "قلم مو‌های",
+    "شبزندهداری": "شب زنده‌داری",
+    "خوردهباشد": "خورده باشد",
+    "داشتهباشید": "داشته باشید",
+    "فزایندهای": "فزاینده‌ای",
+    "عمدهای": "عمده‌ای",
+    "بدیهایی": "بدی‌های",
+    "نوشت‌هایم": "نوشته‌ایم",
+    "بنتالهدی": "بنت الهدی",
+    "نوشتهام": "نوشته‌ام",
+    "سرمایهگذاران": "سرمایه گذاران",
+    "خانهی": "خانه‌ی",
+    "گستاخانهی": "گستاخانه‌ی",
+    "گرفتهباشیم": "گرفته باشیم",
+    "خونهی": "خونه‌ی",
+    "داشتهام": "داشته‌ام",
+    "رشتهام": "رشته‌ام",
+    "سرمایهگذارانشان": "سرمایه گذارانشان",
+    "ریشهکنی": "ریشه‌کنی",
+    "مودبانهتری": "مودبانه‌تری",
+    "برگردانشدهاند": "برگردان شده‌اند",
+    "قرمهسبزی": "قرمه‌سبزی",
+    "راهجویی": "راه جویی",
+    "اماهیچوقت": "اما هیچوقت",
+    "آبوهوای": "آب و هوای",
+    "بقیهاش": "بقیه‌اش",
+    "طبقهبندی": "طبقه‌بندی",
+    "مردههان": "مرده هان",
+    "آمادهاند": "آماده‌اند",
+    "نشدهاید": "نشده‌اید",
+    "آگاهیرسانی": "آگاهی رسانی",
+    "نداشتهاند": "نداشته‌اند",
+    "شکنانهترین": "شکنانه‌ترین",
+    "اقدامهایی": "اقدام‌هایی",
+    "راهآهن": "راه آهن",
+    "شدهاند": "شده‌اند",
+    "تازهترین": "تازه‌ترین",
+    "روبهروی": "رو به روی",
+    "منحصربهفرد": "منحصر به فرد",
+    "سیزدهبدر": "سیزده بدر",
+    "برندهی": "برنده‌ی",
+    "خانهاشتراکی": "خانه اشتراکی",
+    "دادههایی": "داده‌هایی",
+    "استفادهتر": "استفاده‌تر",
+    "گذرنامهتان": "گذرنامه‌تان",
+    "کهنترین": "کهنه‌ترین",
+    "فرهنگسرا": "فرهنگ‌سرا",
+    "آمادهاید": "آماده‌اید",
+    "ویژهی": "ویژه‌ی",
+    "غریزهات": "غریزه‌ات",
+    "مادرشوهری": "مادر شوهری",
+    "نبودهام": "نبوده‌ام",
+    "بودهاند": "بوده‌اند",
+    "وتنها": "و تنها",
+    "بداههکاری": "بداهه‌کاری",
+    "سرمایهگذار": "سرمایه گذار",
+    "برنامهنویس": "برنامه نویس",
+    "مهنازخانم": "مهناز خانم",
+    "مواجهاند": "مواجه‌اند",
+    "توسعهاش": "توسعه‌اش",
+    "سینهام": "سینه‌ام",
+    "سین‌هام": "سینه‌ام",
+    "نمیخواهند": "نمیخواهند",
+    "فنآوری‌ها": "فناوری‌ها",
+    "دنبالهرو": "دنباله‌رو",
+    "لبهی": "لبه‌ی",
+    "اللهیار": "الله یار",
+    "ارزندهتر": "ارزنده‌تر",
+    "برههای": "بره‌ای",
+    "پیادهسازی": "پیاده‌سازی",
+    "دهسالگی": "ده سالگی",
+    "رسانهای": "رسانه‌ای",
+    "ریشسفیدها": "ریش سفید‌ها",
+    "چهجوری": "چه جوری",
+    "ویژگیهایی": "ویژگی‌هایی",
+    "می‌فهمی‌م": "میفهمیم",
+    "وبهم": "و بهم",
+    "قطرهای": "قطره‌ای",
+    "ازتنهایی": "از تنهایی",
+    "لطیفهای": "لطیفه‌ای",
+    "باشهاومدم": "باشه اومدم",
+    "منحصربهفردترین": "منحصر به فرد‌ترین",
+    "کردهاند": "کرده‌اند",
+    "اندازهای": "اندازه‌ای",
+    "بهرهبرداری": "بهره برداری",
+    "اماشوهرجان": "اما شوهر جان",
+    "خانوادهاش": "خانواده‌اش",
+    "نشدهاند": "نشده‌اند",
+    "نکردهایم": "نکرده‌ایم",
+    "تخممرغ‌هایش": "تخم مرغ‌هایش",
+    "وظیفهش": "وظیفه‌اش",
+    "مشگینشهر": "مشگی شهر",
+    "توسعهدهندگانش": "توسعه دهندگانش",
+    "امینابراهیم": "امین ابراهیم",
+    "دربارهاش": "درباره‌اش",
+    "میانافزارها": "میان‌افزارها",
+    "دیدهاند": "دیده‌اند",
+    "خانوادهام": "خانواده‌ام",
+    "مایهی": "مایه‌ی",
+    "نوشتهشدن": "نوشته شدن",
+    "راهحل‌هایشان": "راه حل‌هایشان",
+    "میهماننواز": "میهمان نواز",
+    "زیبندهی": "زیرنده‌ی",
+    "راههایی": "راه‌هایی",
+    "جربزهی": "جربزه‌ی",
+    "بهجا": " به جا",
+    "بطورهمزمان": "به طور همزمان",
+    "فهمیدهبود": "فهمیده بود",
+    "دوربرگردان‌ها": "دور برگردان‌ها",
+    "شالودهی": "شالوده‌ی",
+    "راهکاریی": "راه‌کاری",
+    "مخالفتهایی": "مخالفت‌هایی",
+    "چیزهاازشون": "چیزها ازشون",
+    "سکونتگاه‌های": "سکونت گاه‌های",
+    "سالهابود": "سال‌ها بود",
+    "نمونهی": "نمونه‌ی",
+    "سرمایهگذاری": "سرمایه گذاری",
+    "شبکهای": "شبکه‌ای",
+    "خواهرشوهر": "خواهر شوهر",
+    "سرگیجهآور": "سرگیجه آور",
+    "آستانهی": "آستانه‌ی",
+    "دادهاست": "داده است",
+    "مجسمهسازی": "مجسمه سازی",
+    "ماهرانهترین": "ماهرانه‌ترین",
+    "پنجشنبههایی": "پنجشنبه شب‌هایی",
+    "نرفنهام": "نرفته‌ام",
+    "قورمهسبزی": "قورمه سبزی",
+    "گذارهای": "گذاره‌ای",
+    "بندهخدا": "بنده خدا",
+    "روزنامهنگاران": "روزنامه نگاران",
+    "نقشهی": "نقشه‌ی",
+    "حملهی": "حمله‌ی",
+    "تکنیکهاست": "تکنیک هاست",
+    "نرمافزارهایمان": "نرم‌افرارهایمان",
+    "مادرشوهرم": "مادر شوهرم",
+    "ماهگیمون": "ماه گیمون",
+    "مادرشوهرمحترم": "مادر شوهر محترم",
+    "شوهرداری": "شوهر داری",
+    "سرمایهگذارها": "سرمایه گذارها",
+    "بهرهمند": "بهره‌مند",
+    "درمانهایی": "درمان‌هایی",
+    "عامدانهتر": "عامدانه‌تر",
+    "تازهوارد": "تازه وارد",
+    "مونتهویدئو": "مونته ویدئو",
+    "ذائق‌هاش": "ذائقه‌اش",
+    "گوشهگیرتر": "گوشه‌گیرتر",
+    "دنبالهدار": "دنباله‌دار",
+    "بیخانمان‌ها": "بی‌خانمان‌ها",
+    "سرمایهدارها": "سرمایه‌دارها",
+    "مادرشوهریم": "مادر شوهریم",
+    "صبحان‌هاش": "صبحانه‌اش",
+    "جنازهست": "جنازه است",
+    "شمارهات": "شماره‌ای",
+    "بهقدری": "به قدری",
+    "کیسهی": "کیسه‌ی",
+    "کوششهایی": "کوشش‌هایی",
+    "مادرشوهر": "مادر شوهر",
+    "رابطهی": "رابطه‌ی",
+    "نوشتهاند": "نوشته‌اند",
+    "کنجکاوانهی": "کنجکاوانه‌ی",
+    "غیرمتعهد": "غیر متعهد",
+    "کردهای": "کرده‌ای",
+    "وهمکارانم": "و همکارانم",
+    "گردهمآیی": "گردهمایی",
+    "اللهوردی": "الله وردی",
+    "صرفهجویی": "صرفه جویی",
+    "ماندهاند": "مانده‌اند",
+    "برنامهنویسی": "برنامه‌نویسی",
+    "امینمهدی": "امین مهدی",
+    "سهامدارنی": "سهام دارانی",
+    "مسابقهی": "مسابقه‌ی",
+    "ستارهشناسم": "ستار شناسم",
+    "گرفتهاند": "گرفته‌اند",
+    "جامعهشان": "جامعه‌شان",
+    "بچهی": "بچه‌ی",
+    "شیوهی": "شیوه‌ی",
+    "بهکار": "به کار",
+    "بهتراست": "بهتر است",
+    "سروکلهشون": "سر و کلهشون",
+    "رسیدهمسرش": "رسید همسرش",
+    "پسراهل": "پسر اهل",
+    "پروژههای": "پروژه‌های",
+    "عاقلان‌هام": "عاقلانه‌ام",
+    "گذاشتهاند": "گذاشته‌اند",
+    "کردهام": "کرده‌ام",
+    "اندازهگیری": "اندازه گیری",
+    "یاوهگویی": "یاوه گویی",
+    "سازمانهایی": "سازمان‌هایی",
+    "نمودهاند": "نموده‌اند",
+    "تنهاییآور": "تنهایی آور",
+    "قراردهیم": "قرار دهیم",
+    "ازشوهرجان": "از شوهر جان",
+    "کرهجنوبی": "کره جنوبی",
+    "توهینآمیز": "توهین آمیز",
+    "فنآوریهایی": "فناوری‌هایی",
+    "داشتهاید": "داشته‌اید",
+    "شدهایم": "شده‌ایم",
+    "نمیفهمم": "نمیفهمم",
+    "مثالهایی": "مثال‌هایی",
+    "رییسجمهور": "رییس جمهور",
+    "مجموعهی": "مجموعه‌ی",
+    "درندهاند": "درنده‌اند",
+    "امابهش": "اما بهش",
+    "بازخواهند": "باز خواهند",
+    "برنامههایی": "برنامه‌هایی",
+    "یهجا": "یه جا",
+    "زگیلهایی": "زگیل‌هایی",
+    "وسیلهی": "وسیله‌ی",
+    "بهمنیار": "بهمن یار",
+    "دادهام": "داده‌ام",
+    "بههنگام": "به هنگام",
+    "بهدروغ": "به دروغ",
+    "دورافتادهترین": "دور افتاده‌ترین",
+    "نامهایی": "نامه‌ایی",
+    "سهقسمتی": "سه قسمتی",
+    "توجهازچیدن": "توجه از چیدن",
+    "پیامرسان‌ها": "پیام رسان‌ها",
+    "بهمنزاد": "بهمن زاد",
+    "نشانههایی": "نشانه‌هایی",
+    "راهحل‌های": "راه حل‌های",
+    "راهحلهایی": "راه حل‌هایی",
+    "راهحلهای": "راه حل‌های",
+    "نظرخواهی‌ها": "نظر خواهی‌ها",
+    "نظرخواهیها": "نظر خواهی‌ها",
+    "کندهی": "کنده‌ی",
+    "حرامزاده‌های": "حرام زاده‌های",
+    "شبیهسازیهایی": "شبیه سازی‌هایی",
+    "مهارتهایی": "مهارت‌هایی",
+    "روبهرویشان": "رو به رویشان",
+    "برجستهترین": "برجسته‌ترین",
+    "نمیفهمیدم": "نمیفهمیدم",
+    "دستگاههایی": "دستگاه‌هایی",
+    "برادرشوهر": "برادر شوهر",
+    "گرسن‌هام": "گرسته‌ام",
+    "گرسنههام": "گرسته‌ام",
+    "قهوهخوری": "قهوه خوری",
+    "دادهاید": "داده‌اید",
+    "بهآرامی": "به آرمانی",
+    "دانستنیهاست": "دانستنی‌هاست",
+    "بهراحتی": "به راحتی",
+    "ایدهپردازی": "ایده‌پردازی",
+    "ریشسفیدهای": "ریش سفید‌های",
+    "خفهمون": "خفه مون",
+    "بهجای": "به جای",
+    "ریزخشونت‌ها": "ریز خشونت‌ها",
+    "ریزخشونتها": "ریز خشونت‌ها",
+    "حساسیتهایی": "حساسیت‌هایی",
+    "پشتصحنهی": "پشت صحنه‌ی",
+    "کلهی": "کله‌ی",
+    "تاشوهرم": "تا شوهرم",
+    "آیندهاش": "آینده‌اش",
+    "پروانههایی": "پروانه‌هایی",
+    "خوبیهایی": "خوبی‌هایی",
+    "نرمافزارها": "نرم‌افزارها",
+    "رساندهاند": "رسانده‌اند",
+    "سرمایهگذارنی": "سرمایه گذارانی",
+    "تکهچسبانی": "تکه چسبانی",
+    "بیتوجهی": "بی توجهی",
+    "جاهطلبی": "جاه طلبی",
+    "پرغلغلهتان": "پر غلغله‌تان",
+    "خمینیشهر": "خمینی شهر",
+    "رشتهتوییت": "رشته توییت",
+    "موهبتهایی": "موهبت‌هایی",
+    "برنامهی": "برنامه‌ی",
+    "مادرشوهردارم": "مادر شوهر داردم",
+    "سیاهپوستان": "سیاه پوستان",
+    "شرکتهایی": "شرکت‌هایی",
+    "نیاوردهاند": "نیاورده‌اند",
+    "آنهم": "آن هم",
+    "شوهرداریم": "شوهر داریم",
+    "یکچهارم": "یک چهارم",
+    "پروندههاست": "پرونده هاست",
+    "برنامهت": "برنامه‌ات",
+    "چروکیدهمان": "چروکیده‌مان",
+    "زمینهسازی": "زمینه سازی",
+    "زدهاند": "زده‌اند",
+    "اظهارنظرپرداختن": "اظهار نظر پرداختن",
+    "صلحطلبانهترین": "صلح طلبانه‌ترین",
+    "بهغلط": "به غلط",
+    "ایدهآلم": "ایده آلم",
+    "سیاهکاران": "سیاه کاران",
+    "امیرابراهیم": "امیر ابراهیم",
+    "توسعهدهندگان": "توسعه دهندگان",
+    "لحظهی": "لحظه‌ی",
+    "امینطاها": "امین طاها",
+    "بینالنهرین": "بین النهرین",
+    "نیمهوقت": "نیمه وقت",
+    "پیادهروی": "پیاده روی",
+    "آلودهاند": "آلوده‌اند",
+    "گریهکرد": "گره کرد",
+    "نعمتهایی": "نعمت‌هایی",
+    "مادرشوهرشماهم": "مادر شوهر شما هم",
+    "آشپزخونهاس": "آشپزخونه‌اس",
+    "مسابقهها": "مسابقه‌ها",
+    "مسابقهای": "مسابقه‌های",
+    "برنامهریزی": "برنامه‌ریزی",
+    "بازخواهید": "باز خواهید",
+    "جوییما": "جویی ما",
+    "آماده ایم": "آماده‌ایم",
+    "مدلسازی": "مدل‌سازی",
+    "درصورتیکه": "در صورتیکه",
+    "آمریکاییات": "آمریکایی‌ات",
+    "مادریاش": "مادری‌اش",
+    "غافلگیرکننده": "غافلگیر کننده",
+    "پیکرتراشی": "پیکر تراشی",
+    "اذیتوآزار": "اذیت و آزار",
+    "امتیازاورترین": "امتیاز آور",
+    "جیکجیک": "جیک جیک",
+    "تاشب": "تا شب",
+    "کپیرایت": "کپی رایت",
+    "آنتیبادی": "آنتی بادی",
+    "عجیبتر": "عجیب‌تر",
+    "استانداردسازی": "استاندارد سازی",
+    "هشتادوهشت": "هشتاد و هشت",
+    "متنوعتر": "متنوع‌تر",
+    "منظورانجام": "منظور انجام",
+    "نگرانکننده‌ترین": "نگران کننده‌ترین",
+    "شگفتانگیز": "شگفت انگیز",
+    "رنگینپوست": "رنگین پوست",
+    "فارغ التحصیلان": "فارغ‌التحصیلان",
+    "ترسناکتر": "ترسناک‌تر",
+    "لا رامبلا": "لارامبلا",
+    "پرجمعیتترین": "پرجمعیت‌ترین",
+    "درمیآیند": "درمی‌آیند",
+    "باشمالکی": "باشم الکی",
+    "وسیعتر": "وسیع‌تر",
+    "فاحشهخانه": "فاحشه خانه",
+    "بااحتیاط": "با احتیاط",
+    "قانعکننده": "قانع‌کننده",
+    "انعطافپذیری": "انعطاف‌پذیری",
+    "بیتالمقدس": "بیت‌المقدس",
+    "اوپناستریتمپ": "اوپن استریت مپ",
+    "روزابارونی": "روزا بارونی",
+    "محافظهکارانه": "محافظه کارانه",
+    "فوتبالدستی": "فوتبال دستی",
+    "توسعهدهنده": "توسعه دهنده",
+    "قانونگزاران": "قانون گزاران",
+    "العسریسرا": "العسر یسرا",
+    "خارقالعاده": "خارق‌العاده",
+    "بیماریمزمن": "بیماری مزمن",
+    "بادوستانتان": "با دوستانتان",
+    "برابربیشتر": "برابر بیشتر",
+    "ارائهدهنده": "ارائه دهنده",
+    "طوفانزدگان": "طوفان زندگان",
+    "امینمحمد": "امین محمد",
+    "محیطزیست": "محیط زیست",
+    "شقیترینشان": "شقی‌ترینشان",
+    "بودواقعا": "بود واقعا",
+    "نیویورکتایمز": "نیویورک تایمز",
+    "ریودوژانیرو": "ریو دو ژانیرو",
+    "مشترکالمنافع": "مشترک‌المنافع",
+    "اسلایدسازم": "اسلاید سازم",
+    "نمیآوریدش": "نمی‌آوریدش",
+    "بینالملل": "بین‌الملل",
+    "مصرفکنندگان": "مصرف کنندگان",
+    "امینالدین": "امین الدین",
+    "امریکااینقدر": "امریکا اینقدر",
+    "بعضیاوقات": "بعضی اوقات",
+    "خاطربچه": "خاطر بچه",
+    "ایناکیلویی": "اینا کیلویی",
+    "ویکیپدیا": "ویکی‌پدیا",
+    "مافکرمیکنیم": "ما فکر میکنیم",
+    "انگلیسیزبان": "انگلیسی زبان",
+    "کلهشون": "کله‌شون",
+    "آدمبزرگی": "آرم بزرگی",
+    "مر آت مر آه": "مر‌آت مر‌آت",
+    "آسیبزد": "آسیب زد",
+    "آیآرسی": "آی آرسی",
+    "آسیااقیانوسیه": "آسیا اقیانوسیه",
+    "آیای": "آیا",
+    "میانجنسی": "میان جنسی",
+    "میاننسلی": "میان نسلی",
+    "میان‌افزار‌ها": "میان افزارها",
+    "آییننامه": "آیین‌نامه",
+    "ارائهشده": "ارائه‌شده",
+    "اشپزخونه": "آشپزخونه",
+    "اماعلتشونمیپرسه": "اما علتشو نمیپرسه",
+    "امیدوارکننده": "امیدوار کننده",
+    "ایالاتمتحده": "ایالات متحده",
+    "بااینکه": "با اینکه",
+    "بلندپروازانه": "بلند پروازانه",
+    "بهترازاینه": "بهتر از اینه",
+    "بهدست‌آمده": "به دست‌آمده",
+    "بهوسیله": "به وسیله",
+    "بیادبانه": "بی ادبانه",
+    "بیاندازه": "بی اندازه",
+    "بیصبرانه": "بی صبرانه",
+    "بیفایده": "بی فایده",
+    "بیمهره": "بی مهره",
+    "بینظیره": "بی نظیره",
+    "تاریخزده": "تاریخ زده",
+    "تهرانزده": "تهران زده",
+    "تولیدشده": "تولید شده",
+    "تولیدکننده": "تولید کننده",
+    "تکمیلشده": "تکمیل شده",
+    "جاافتاده": "جا افتاده",
+    "جمع‌آوریکننده": "جمع‌ آوری کننده",
+    "جورآدمیه": "جور آدمیه",
+    "حقالزحمه": "حق الزحمه",
+    "دخترونهتره": "دخترونه تره",
+    "دوپنجره": "دو پنجره",
+    "ذاتالریه": "ذات‌الریه",
+    "راسالخیمه": "راس‌الخیمه",
+    "رنگماده": "رنگ ماده",
+    "سوئاستفاده": "سو استفاده",
+    "سواستفاده": "سو استفاده",
+    "شبهجزیره": "شبه جزیره",
+    "صادرکننده": "صادر کننده",
+    "ضررداره": "ضرر داره",
+    "عابرپیاده": "عابر پیاده",
+    "فوقالعاده": "فوق‌العاده",
+    "قابلتوجه": "قابل توجه",
+    "قانع‌کننده": "قانع‌ کننده",
+    "مادربیچاره": "مادر بیچاره",
+    "مشخصشده": "مشخص شده",
+    "مصرفکننده": "مصرف کننده",
+    "مصیبتزده": "مصیب تزده",
+    "ناامیدکننده": "ناامید کننده",
+    "نیمفاصله": "نیم‌فاصله",
+    "هماهنگکننده": "هماهنگ کننده",
+    "همهجانبه": "همه جانبه",
+    "واردکننده": "وارد کننده",
+    "وخوابگاه": "و خوابگاه",
+    "ودستگاه": "و دستگاه",
+    "وزردچوبه": "و زردچوبه",
+    "وپروانه": "و پروانه",
+    "پدرخوانده": "پدر خوانده",
+    "چاپشده": "چاپ شده",
+    "کردته": "کرد ته",
+    "کردندکه": "کردند که",
+    "یکطرفه": "یک طرفه",
+    "پایینتره": "پایین‌تره",
+    "اشتراکگذاری": "اشتراک گذاری",
+    "انحصارگراناند": "انحصار گران‌اند",
+    "خوشحالییییی": "خوشحالی",
+    "همتیمی‌هایشان": "هم تیمی‌هایشان",
+    "پایدار‌ام‌باید": "پایدار‌ام ‌باید",
+    "پرجنبوجوش‌تر": "پر جنب و جوش‌تر",
+    "آبمروارید": "آب مروارید",
+    "آتشسوزی": "آتش سوزی",
+    "آتشنشانی": "آتش‌نشانی",
+    "آتشنشان": "آتش‌نشان",
+    "آرامشبخش": "آرامش بخش",
+    "آشناداشتن": "آشنا داشتن",
+    "آقاچیزی": "آقا چیزی",
+    "آموخت‌هام": "آموخته‌ام",
+    "آموزششان": "آموزش‌شان",
+    "ازآنجا": "از آنجا",
+    "ازالان": "از الان",
+    "ازاینجا": "از اینجا",
+    "ازجیبش": "از جیبش",
+    "ازدستش": "از دستش",
+    "ازدیوار": "از دیوار",
+    "ازشغلشون": "از شغلشون",
+    "ازوقتی": "از وقتی",
+    "ازکسانی": "از کسانی",
+    "اسباببازی": "اسباب بازی",
+    "اسبسوار": "اسب سوار",
+    "اصیلزاده": "اصیل زاده",
+    "افتادهاید": "افتاده‌اید",
+    "ال‌هام": "الهام",
+    "امااصلا": "اما اصلا",
+    "امااصلابه": "اما اصلا به",
+    "امااین": "اما این",
+    "امابعد": "اما بعد",
+    "امابعدیکی": "اما بعد یکی",
+    "اماجاذبه": "اما جاذبه",
+    "امرارمعاش": "امرار معاش",
+    "امکانپذیر": "امکان پذیر",
+    "انت‌های": "انتهای",
+    "انت‌هایی": "انتهایی",
+    "ایزدبانوی": "ایزد بانوی",
+    "بااینحال": "با اینحال",
+    "باحتمال": "به احتمال",
+    "باحجاب": "با حجاب",
+    "باخنده": "با خنده",
+    "بادوستاش": "با دوستاش",
+    "بارمان": "بار مان",
+    "باز‌تر": "باز ‌تر",
+    "باطعنه": "با طعنه",
+    "بافریاد": "با فریاد",
+    "بارگزاری": "بارگذاری",
+    "بالامنم": "بالا منم",
+    "بگیرمامان": "بگیر مامان",
+    "بیاحترامی": "بی احترامی",
+    "بیادبی": "بی ادبی",
+    "بیاعتنا": "بی اعتنا",
+    "بیدارباش": "بیدار باش",
+    "بیشازحد": "بیش از حد",
+    "بیمسئولیت": "بی مسئولیت",
+    "تاسفبار": "تاسف بار",
+    "تامشکلمون": "تا مشکلمون",
+    "تانقشه": "تا نقشه",
+    "تصمیمگیری": "تصمیم گیری",
+    "تقسیمبندی": "تقسیم بندی",
+    "تقصیرارو": "تقصیرا رو",
+    "جدیدابرای": "جدیدا برای",
+    "جعبهابزار": "جعبه ابزار",
+    "جلوتونو": "جلو تو نو",
+    "حاضردر": "حاضر در",
+    "حاضرنیست": "حاضر نیست",
+    "دستنخورده": "دست نخورده",
+    "دوامتیاز": "دو امتیاز",
+    "دوروزتمام": "دو روز تمام",
+    "شخصیسازی": "شخصی‌سازی",
+    "شدواجناس": "شد و اجناس",
+    "شوهردارم": "شوهر دارم",
+    "شوهرشماهم": "شوهر شما هم",
+    "شوهرمحترم": "شوهر محترم",
+    "شکلگیری": "شکل گیری",
+    "صخرهنوردی": "صخره‌نوردی",
+    "صدوبیست": "صد و بیست",
+    "عقبنشینی": "عقب نشینی",
+    "عکسالعمل": "عکس‌العمل",
+    "غرغرمیکنم": "غرغر میکنم",
+    "هزاربار": "هزار بار",
+    "هزارتومان": "هزار تومان",
+    "هزارجور": "هزار جور",
+    "هزاروسیصد": "هزار و سیصد",
+    "هممیهنان": "هم میهنان",
+    "هممیهنانش": "هم میهنانش",
+    "همنسلانش": "هم نسلانش",
+    "همهگیری": "همه گیری",
+    "هییییچ": "هیچ",
+    "وقتاخیلی": "وقتا خیلی",
+    "وقتابه": "وقتا به",
+    "وقتگذرانی": "وقت گذرانی",
+    "ومحکوم": "و محکوم",
+    "ومحیط‌ها": "و محیط‌ها",
+    "وکشورتان": "و کشورتان",
+    "ویکیمدیا": "ویکی‌مدیا",
+    "یهوگفت": "یهو گفت",
+    "اینجااز": "اینجا از",
+}
+# Whole-token corrections; `normalizer_at_word_level` in normalizer.py looks
+# each word up with `fixator_dictionary.get(word, word)`. The keys are tokens
+# containing a zero-width non-joiner (ZWNJ, U+200C) and the values are the
+# spellings that replace them.
+# NOTE(review): most values are simply the key without its ZWNJ, but the
+# values for "توئی‌تر" and "کرونوم‌تر" also differ in their letters — confirm
+# that those spellings are intended.
+fixator_dictionary = {
+    "ب‌های": "بهای",
+    "به‌ترین": "بهترین",
+    "آس‌تر": "‌آستر",
+    "ارکس‌تر": "ارکستر",
+    "ان‌تر": "انتر",
+    "بس‌تر": "بستر",
+    "به‌تر": "بهتر",
+    "به‌ترتر": "بهترتر",
+    "توئی‌تر": "تویتتر",
+    "تویی‌تر": "توییتر",
+    "تی‌تر": "تیتر",
+    "دخ‌تر": "دختر",
+    "دف‌تر": "دفتر",
+    "دلس‌تر": "دلستر",
+    "دک‌تر": "دکتر",
+    "ش‌تر": "شتر",
+    "لی‌تر": "لیتر",
+    "م‌تر": "متر",
+    "هیپس‌تر": "هیپستر",
+    "پی‌تر": "پیتر",
+    "چ‌تر": "چتر",
+    "کم‌تر": "کمتر",
+    "گنگس‌تر": "گنگستر",
+    "انگش‌تر": "انگشتر",
+    "سن‌تر": "سنتر",
+    "تویت‌تر": "توییتر",
+    "مادهش‌تر": "ماده شتر",
+    "وی‌ترین": "ویترین",
+    "کرونوم‌تر": "کرنومتر",
+    "که‌تر": "کهتر",
+    "فیل‌تر": "فیلتر",
+    "ال‌هام": "الهام",
+    "آل‌مان": "آلمان",
+    "انت‌های": "انتهای",
+    "انت‌هایی": "انتهایی",
+    "آموخت‌هام": "آموخته‌ام",
+}

src/normalizer.py ADDED Viewed

	@@ -0,0 +1,227 @@

+from parsivar import Normalizer
+from parsivar import SpellCheck
+import num2fawords
+import re
+import string
+from dictionary import dictionary_mapping, fixator_dictionary
+_normalizer = Normalizer(half_space_char="\u200c", statistical_space_correction=True)
+_spell = SpellCheck()
+# Characters deleted from every transcript: `normalizer` passes the regex built
+# below to `remove_special_characters`. The list contains duplicates, which
+# make no difference once the entries are merged into one character class.
+chars_to_ignore = [
+    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
+    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
+    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
+    'ā', 'š', 'ّ', 'ْ',
+]
+# ASCII lowercase letters and ASCII digits are deleted as well.
+chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
+# Collapse the list into a single regex character class; from here on the name
+# refers to that pattern string, not to the list.
+# NOTE(review): the entries are joined without escaping, so the adjacent
+# "!", "-", ";" items form the range `!-;` inside the brackets — confirm that
+# the extra characters covered by this range are meant to be removed too.
+chars_to_ignore = f"""[{"".join(chars_to_ignore)}]"""
+# Zero-width non-joiner inserted by the passes in `normalizer`.
+zwnj = "\u200c"
+# Characters after which the "آ" pass in `normalizer` does not insert a ZWNJ.
+silent_chars = ["ا", "د", "ذ", "ر", "ز", "و", "آ"] + [zwnj] + [" "]
+def multiple_replace(text, chars_to_mapping):
+    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
+    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
+def remove_special_characters(text, chars_to_ignore_regex):
+    text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
+    return text
+def convert_word_nums_to_text(word):
+    try:
+        word = int(word)
+        word = num2fawords.words(word)
+    except:
+        word = word
+    return word
+def normalizer_at_word_level(text):
+    words = text.split()
+    _text = []
+    for word in words:
+        word = convert_word_nums_to_text(word)
+        word = fixator_dictionary.get(word, word)
+        _text.append(word)
+    return " ".join(_text) + " "
+def finder(ss, s, starter=False):
+    found = []
+    for m in re.finditer(ss, s):
+        if starter:
+            found.append(m.start())
+        else:
+            found.append((m.start(), m.end()))
+    return found
+def substring_replace(ss, s, start, end, stripped=True):
+    s_start = s[:start]
+    s_end = s[end:]
+    counter = 0
+    if stripped:
+        counter = 1 if s_start.endswith(" ") else counter
+        s_start = s_start.rstrip()
+    return s_start + ss + s_end, counter
+def normalizer(
+        batch,
+        is_normalize=True,
+        is_spell_check=False,
+        return_dict=True,
+        filter_trivials=False,
+        remove_extra_space=False
+):
+    """
+    Normalize the transcript stored under ``batch["sentence"]``.
+
+    The text is lower-cased and stripped, optionally run through the Parsivar
+    normalizer, rewritten with ``dictionary_mapping`` and cleaned with
+    ``chars_to_ignore``. Zero-width non-joiners (ZWNJ) are then inserted next
+    to "آ", the "ها…" endings, "افزار" and "تر"/"ترین", and finally every
+    word goes through ``normalizer_at_word_level``.
+
+    Args:
+        batch: Mapping with a ``"sentence"`` string; its ``"sentence"`` entry is
+            overwritten when ``return_dict`` is true.
+        is_normalize: Run ``_normalizer.normalize`` on the text first.
+        is_spell_check: Run ``_spell.spell_corrector`` (followed by another
+            ``_normalizer.normalize``) after the ZWNJ passes.
+        return_dict: Return ``batch`` when true, otherwise only the text.
+        filter_trivials: Replace a result whose length (including the trailing
+            space, if kept) is 2 or less by ``None``.
+        remove_extra_space: Strip the result; when false, one trailing space
+            is kept.
+
+    Returns:
+        ``batch`` with the normalized ``"sentence"``, or just the normalized
+        text (possibly ``None``) when ``return_dict`` is false.
+    """
+    text = batch["sentence"].lower().strip()
+    # Parsivar normalizer
+    if is_normalize:
+        text = _normalizer.normalize(text)
+    # Dictionary mapping (substring replacements), then collapse repeated spaces
+    text = multiple_replace(text, dictionary_mapping)
+    text = re.sub(" +", " ", text)
+    # Remove specials (this also lower-cases and appends a trailing space)
+    text = remove_special_characters(text, chars_to_ignore)
+    text = re.sub(" +", " ", text)
+    # Replace connected آ: put a ZWNJ after the character preceding each آ,
+    # unless that character is one of `silent_chars`.
+    # The offsets from `finder` refer to the text before this loop edits it;
+    # `pointer` is the shift accumulated from the edits made so far.
+    special, pointer = "آ", int("0")
+    for f in sorted(finder(special, text, True)):
+        # Position of the character immediately before this آ.
+        # NOTE(review): for an آ at offset 0 with no prior shift this is -1,
+        # which the length check below does not exclude, so text[index] then
+        # reads the last character of the text.
+        index = f + pointer - 1
+        if len(text) >= index:
+            if text[index] not in silent_chars:
+                # Re-insert the preceding character followed by a ZWNJ; with
+                # stripped=True, whitespace before that character is dropped.
+                new_text, extra_pointer = substring_replace(
+                    f"{text[index]}{zwnj}", text, index, index + 1, stripped=True)
+                text = new_text
+                # Net shift: +1 for the ZWNJ, minus 1 if a space was removed.
+                pointer += 1 + 1 - 1 - extra_pointer
+    # Replace connected ها: prefix each ending below with a ZWNJ when it is at
+    # the very end of the text or is followed by a space.
+    pointer = int("0")
+    special_list = [
+        # "ام", "ای", "است", "ایم", "اید", "اند",
+        "هایمان", "هایم", "هایت", "هایش",
+        "هایتان", "هایشان", "هام", "هات",
+        "هاتان", "هامون", "هامان", "هاش",
+        "هاتون", "هاشان", "هاشون",
+        "هایی", "های", "هاس", "ها"
+    ]
+    for special in special_list:
+        # Offsets are recomputed for every ending, so the shift restarts at 0.
+        pointer = 0
+        text = text
+        for f in sorted(finder(special, text, False)):
+            # `start`/`end` are one less than the shifted match span, so
+            # text[start + 1:end + 1] is the match and text[end + 1] is the
+            # character right after it.
+            start, end = f[0] + pointer - 1, f[1] + pointer - 1
+            if len(text) >= (end + 1):
+                if len(text) == (end + 1):
+                    # The match ends the text.
+                    new_text, extra_pointer = substring_replace(
+                        f"{zwnj}{special}",
+                        text,
+                        start + 1,
+                        end + 1,
+                        stripped=True)
+                    text = new_text
+                    pointer += 1 + 1 - 1 - extra_pointer
+                else:
+                    # The match is followed by a space, i.e. it ends a word.
+                    if text[end + 1] == " ":
+                        new_text, extra_pointer = substring_replace(
+                            f"{zwnj}{special}",
+                            text,
+                            start + 1,
+                            end + 1,
+                            stripped=True)
+                        text = new_text
+                        pointer += 1 + 1 - 1 - extra_pointer
+    # Replace connected افزار: every occurrence gets a ZWNJ prefix, whatever
+    # follows it; whitespace before the occurrence is dropped.
+    special, pointer = "افزار", int("0")
+    for f in sorted(finder(special, text, False)):
+        start, end = f[0] + pointer - 1, f[1] + pointer - 1
+        if len(text) >= (end + 1):
+            new_text, extra_pointer = substring_replace(f"{zwnj}{special}", text, start + 1, end + 1, stripped=True)
+            text = new_text
+            pointer += 1 + 1 - 1 - extra_pointer
+    # Replace connected تر / ترین: same rule as for the ها endings above.
+    pointer = int("0")
+    special_list = [
+        "ترین", "تر"
+    ]
+    for special in special_list:
+        pointer = 0
+        text = text
+        for f in sorted(finder(special, text, False)):
+            start, end = f[0] + pointer - 1, f[1] + pointer - 1
+            if len(text) >= (end + 1):
+                if len(text) == (end + 1):
+                    new_text, extra_pointer = substring_replace(
+                        f"{zwnj}{special}",
+                        text,
+                        start + 1,
+                        end + 1,
+                        stripped=True)
+                    text = new_text
+                    pointer += 1 + 1 - 1 - extra_pointer
+                else:
+                    if text[end + 1] == " ":
+                        new_text, extra_pointer = substring_replace(
+                            f"{zwnj}{special}",
+                            text,
+                            start + 1,
+                            end + 1,
+                            stripped=True)
+                        text = new_text
+                        pointer += 1 + 1 - 1 - extra_pointer
+    # Parsivar spell correction
+    if is_spell_check:
+        text = _normalizer.normalize(_spell.spell_corrector(text))
+    # Normalizer at word level (number spelling and `fixator_dictionary`)
+    text = normalizer_at_word_level(text)
+    text = re.sub(" +", " ", text)
+    if remove_extra_space:
+        text = text.strip()
+    else:
+        text = text.strip() + " "
+    if filter_trivials:
+        if not len(text) > 2:
+            text = None
+    if not return_dict:
+        return text
+    batch["sentence"] = text
+    return batch
+if __name__ == '__main__':
+    input_text = "سلام بر شما که میآیید و میآموزید که بیآرآیم"
+    print(normalizer({"sentence": input_text}, return_dict=False))
+    input_text = "کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند."
+    print(normalizer({"sentence": input_text}, return_dict=False))
+    input_text = " میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها"
+    print(normalizer({"sentence": input_text}, return_dict=False))
+    input_text = "این کتاب بهترین در نوع شتر آسانتر هست"
+    print(normalizer({"sentence": input_text}, return_dict=False))
+    input_text = "سه چیز هست که از پژوهش در این زمینه آموختهام"
+    print(normalizer({"sentence": input_text}, return_dict=False))

src/requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+num2fawords
+parsivar
+tensorboard

src/run_config.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import ast
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+from transformers import (
+    HfArgumentParser,
+    Wav2Vec2Config,
+    Wav2Vec2FeatureExtractor
+)
+# Module-level logger; `main` configures logging and sets this logger to INFO.
+logger = logging.getLogger(__name__)
+@dataclass
+class ConfigArguments:
+    """
+    Arguments to which config we are going to set up.
+    """
+    output_dir: str = field(
+        default=".",
+        metadata={"help": "The output directory where the config will be written."},
+    )
+    name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization."
+                    "Don't set if you want to train a model from scratch."
+        },
+    )
+    config_params: Optional[str] = field(
+        default=None,
+        metadata={"help": "Custom configuration for the specific `name_or_path`"}
+    )
+    feature_extractor_params: Optional[str] = field(
+        default=None,
+        metadata={"help": "Custom feature extractor configuration for the specific `name_or_path`"}
+    )
+    def __post_init__(self):
+        if self.config_params:
+            try:
+                self.config_params = ast.literal_eval(self.config_params)
+            except Exception as e:
+                print(f"Your custom `config` parameters do not acceptable due to {e}")
+        if self.feature_extractor_params:
+            try:
+                self.feature_extractor_params = ast.literal_eval(self.feature_extractor_params)
+            except Exception as e:
+                print(f"Your custom `feature_extractor` parameters do not acceptable due to {e}")
+def main():
+    parser = HfArgumentParser([ConfigArguments])
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        config_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
+    else:
+        config_args = parser.parse_args_into_dataclasses()[0]
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(logging.INFO)
+    logger.info(f"Setting up configuration {config_args.name_or_path} with extra params {config_args.config_params}")
+    if config_args.config_params and isinstance(config_args.config_params, dict):
+        config = Wav2Vec2Config.from_pretrained(
+            config_args.name_or_path,
+            **config_args.config_params
+        )
+    else:
+        config = Wav2Vec2Config.from_pretrained(
+            config_args.name_or_path,
+            mask_time_length=10,
+            mask_time_prob=0.05,
+            diversity_loss_weight=0.1,
+            num_negatives=100,
+            do_stable_layer_norm=True,
+            feat_extract_norm="layer",
+            vocab_size=40
+        )
+    logger.info(f"Setting up feature_extractor {config_args.name_or_path} with extra params "
+                f"{config_args.feature_extractor_params}")
+    if config_args.feature_extractor_params and isinstance(config_args.feature_extractor_params, dict):
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+            config_args.name_or_path,
+            **config_args.feature_extractor_params
+        )
+    else:
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+            config_args.name_or_path,
+            return_attention_mask=True
+        )
+    logger.info(f"Your `config` saved here {config_args.output_dir}/config.json")
+    config.save_pretrained(config_args.output_dir)
+    logger.info(f"Your `feature_extractor` saved here {config_args.output_dir}/preprocessor_config.json")
+    feature_extractor.save_pretrained(config_args.output_dir)
+if __name__ == '__main__':
+    main()

src/run_persian.sh ADDED Viewed

	@@ -0,0 +1,51 @@

+#!/bin/bash
+export LC_ALL=C.UTF-8
+export LANG=C.UTF-8
+export OUTPUT_DIR=/home/m3hrdadfi/code/wav2vec2-base-persian
+export MODEL_NAME_OR_PATH=/home/m3hrdadfi/code/wav2vec2-base-persian
+export TRAIN_FILE=/home/m3hrdadfi/code/data/train.csv
+export VALIDATION_FILE=/home/m3hrdadfi/code/data/test.csv
+export SPEECH_FILE_COLUMN=path
+#export MAX_EVAL_SAMPLES=5000
+export PER_DEVICE_TRAIN_BATCH_SIZE=32
+export PER_DEVICE_EVAL_BATCH_SIZE=32
+#export GRADIENT_ACCUMULATION_STEPS=2
+export NUM_TRAIN_EPOCHS=5.0
+export LEARNING_RATE=5e-4
+export WARMUP_STEPS=1000
+#export LOGGING_STEPS=500
+#export EVAL_STEPS=2500
+#export SAVE_STEPS=2500
+export PREPROCESSING_NUM_WORKERS=4
+export MAX_DURATION_IN_SECONDS=20.0
+export ADAM_BETA_1=0.9
+export ADAM_BETA_2=0.98
+export WEIGHT_DECAY=0.01
+export D_TYPE=bfloat16
+export PAD_TO_MULTIPLE_OF=16384
+python src/run_wav2vec2_pretrain_flax.py \
+    --output_dir="$OUTPUT_DIR"  \
+    --train_file="$TRAIN_FILE" \
+    --validation_file="$VALIDATION_FILE" \
+    --speech_file_column="$SPEECH_FILE_COLUMN" \
+    --model_name_or_path="$MODEL_NAME_OR_PATH"  \
+    --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE \
+    --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \
+    --preprocessing_num_workers=$PREPROCESSING_NUM_WORKERS \
+    --max_duration_in_seconds=$MAX_DURATION_IN_SECONDS \
+    --num_train_epochs=$NUM_TRAIN_EPOCHS \
+    --learning_rate=$LEARNING_RATE \
+    --warmup_steps=$WARMUP_STEPS \
+    --weight_decay=$WEIGHT_DECAY \
+    --adam_beta1=$ADAM_BETA_1 \
+    --adam_beta2=$ADAM_BETA_2 \
+    --dtype="$D_TYPE" \
+    --pad_to_multiple_of=$PAD_TO_MULTIPLE_OF \
+    --push_to_hub

src/run_wav2vec2_pretrain_flax.py ADDED Viewed

	@@ -0,0 +1,638 @@

+import logging
+import sys
+import time
+from dataclasses import field
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+# !/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Training the library models for Wav2Vec.
+"""
+# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
+import numpy as np
+from datasets import DatasetDict, load_dataset
+from tqdm import tqdm
+import flax
+import jax
+import jax.numpy as jnp
+import librosa
+import optax
+from flax import jax_utils, traverse_util
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from transformers import (
+    FlaxWav2Vec2ForPreTraining,
+    HfArgumentParser,
+    TrainingArguments,
+    Wav2Vec2Config,
+    Wav2Vec2FeatureExtractor,
+    is_tensorboard_available,
+)
+from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices, _sample_negative_indices
+from normalizer import normalizer
+# Module-level logger; `configure_logger` sets its level (WARNING, or DEBUG
+# when verbose logging is requested).
+logger = logging.getLogger(__name__)
+@flax.struct.dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    freeze_feature_extractor: Optional[bool] = field(
+        default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
+    )
+    gradient_checkpointing: Optional[bool] = field(
+        default=False, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
+    )
+    verbose_logging: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether to log verbose messages or not."},
+    )
+    max_gumbel_temperature: Optional[float] = field(
+        default=2.0, metadata={"help": "Maximum temperature for gumbel softmax."}
+    )
+    min_gumbel_temperature: Optional[float] = field(
+        default=0.1, metadata={"help": "Minimum temperature for gumbel softmax."}
+    )
+    gumbel_temperature_decay: Optional[float] = field(
+        default=0.999995, metadata={"help": "Decay of gumbel temperature during training."}
+    )
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+@flax.struct.dataclass
+class DataTrainingArguments:
+    """
+    Arguments describing the data used for training and evaluation.
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+    The data can be named through ``dataset_name`` (with optional config and
+    split names) or given as ``train_file`` / ``validation_file`` paths; the
+    remaining fields control caching, preprocessing workers, the maximum audio
+    duration and padding.
+    """
+    # Dataset selected by name (via the datasets library)
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_split_name: Optional[str] = field(
+        default="train",
+        metadata={
+            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
+        },
+    )
+    validation_split_name: Optional[str] = field(
+        default="validation",
+        metadata={
+            "help": "The name of the validation data set split to use (via the datasets library). Defaults to 'validation'"
+        },
+    )
+    # Dataset given as local csv/JSON files
+    train_file: Optional[str] = field(
+        default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
+    )
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
+    )
+    speech_file_column: Optional[str] = field(
+        default="file",
+        metadata={"help": "Column in the dataset that contains speech file path. Defaults to 'file'"},
+    )
+    # Preprocessing options
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_duration_in_seconds: Optional[float] = field(
+        default=20.0, metadata={"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"}
+    )
+    pad_to_multiple_of: Optional[int] = field(
+        default=1024,
+        metadata={
+            "help": "If set will pad the sequence to a multiple of the provided value. This is important to avoid triggering recompilations on TPU"
+        },
+    )
+@flax.struct.dataclass
+class FlaxDataCollatorForWav2Vec2Pretraining:
+    """
+    Data collator that will dynamically pad the inputs received and prepare masked indices
+    for self-supervised pretraining.
+    Args:
+        model (:class:`~transformers.FlaxWav2Vec2ForPreTraining`):
+            The Wav2Vec2 model used for pretraining. The data collator needs to have access
+            to config and ``_get_feat_extract_output_lengths`` function for correct padding.
+        feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`):
+            The processor used for proccessing the data.
+        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+              sequence if provided).
+            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+              maximum acceptable input length for the model if that argument is not provided.
+            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+              different lengths).
+        max_length (:obj:`int`, `optional`):
+            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            If set will pad the sequence to a multiple of the provided value.
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.5 (Volta).
+    """
+    model: FlaxWav2Vec2ForPreTraining
+    feature_extractor: Wav2Vec2FeatureExtractor
+    padding: Union[bool, str] = "longest"
+    pad_to_multiple_of: Optional[int] = None
+    max_length: Optional[int] = None
+    def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
+        # reformat list to dict and set to pytorch format
+        batch = self.feature_extractor.pad(
+            features,
+            max_length=self.max_length,
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors="np",
+        )
+        mask_indices_seq_length = self.model._get_feat_extract_output_lengths(batch["input_values"].shape[-1])
+        # sample randomly masked indices
+        batch["mask_time_indices"] = _compute_mask_indices(
+            (batch["input_values"].shape[0], mask_indices_seq_length),
+            self.model.config.mask_time_prob,
+            self.model.config.mask_time_length,
+            min_masks=2,
+        )
+        # sample indices to take for negative vectors
+        batch["sampled_negative_indices"] = _sample_negative_indices(
+            (batch["mask_time_indices"].shape + (self.model.config.proj_codevector_dim,)),
+            self.model.config.num_negatives,
+        )
+        return batch
+def configure_logger(model_args: ModelArguments, training_args: TrainingArguments):
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logging_level = logging.WARNING
+    if model_args.verbose_logging:
+        logging_level = logging.DEBUG
+    logger.setLevel(logging_level)
+def write_train_metric(summary_writer, train_metrics, train_time, step):
+    summary_writer.scalar("train_time", train_time, step)
+    train_metrics = get_metrics(train_metrics)
+    for key, vals in train_metrics.items():
+        tag = f"train_{key}"
+        for i, val in enumerate(vals):
+            summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+def write_eval_metric(summary_writer, eval_metrics, step):
+    for metric_name, value in eval_metrics.items():
+        summary_writer.scalar(f"eval_{metric_name}", value, step)
+def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
+    num_samples = len(samples_idx)
+    samples_to_remove = num_samples % batch_size
+    if samples_to_remove != 0:
+        samples_idx = samples_idx[:-samples_to_remove]
+    sections_split = num_samples // batch_size
+    batch_idx = np.split(samples_idx, sections_split)
+    return batch_idx
+def compute_contrastive_loss(
+        quantized_features, transformer_features, negative_indices, mask_time_indices, logits_temp, num_negatives
+):
+    """Return the summed contrastive loss between transformer outputs and quantized targets.
+
+    For every position the candidate set is the position's own quantized vector (class 0) followed by
+    ``num_negatives`` sampled quantized vectors; the loss is a softmax cross-entropy over the
+    temperature-scaled cosine similarities to those candidates.
+
+    Args:
+        quantized_features: rank-3 array unpacked as (batch_size, sequence_length, hidden_size).
+        transformer_features: compared against the candidates via ``optax.cosine_similarity``;
+            presumably the same shape as ``quantized_features`` -- TODO confirm.
+        negative_indices: row indices into ``quantized_features`` flattened to (batch * sequence, hidden);
+            must hold batch_size * sequence_length * num_negatives entries in total.
+        mask_time_indices: rank-2 array (it is transposed with ``(1, 0)``); positions equal to 1 contribute
+            to the loss. Presumably laid out as (batch, sequence) -- TODO confirm.
+        logits_temp: divisor applied to the similarities before the softmax.
+        num_negatives: number of sampled negatives per position.
+
+    Returns:
+        The loss summed (not averaged) over all contributing positions.
+    """
+    batch_size, sequence_length, hidden_size = quantized_features.shape
+    # take negative vectors from sampled indices
+    quantized_negatives = quantized_features.reshape(-1, hidden_size)[negative_indices.reshape(-1)]
+    # regroup to (batch, sequence, num_negatives, hidden), then move the negatives axis to the front
+    quantized_negatives = quantized_negatives.reshape(
+        batch_size, sequence_length, num_negatives, hidden_size
+    ).transpose(2, 0, 1, 3)
+    # candidate 0 is the true quantized vector, candidates 1..num_negatives are the sampled negatives
+    target_features = jnp.concatenate([quantized_features[None, :], quantized_negatives], axis=0)
+    # presumably reduces over the trailing hidden axis, leaving (1 + num_negatives, batch, sequence) -- TODO confirm
+    loss_logits = optax.cosine_similarity(transformer_features, target_features)
+    loss_logits = loss_logits / logits_temp
+    # flag sampled negatives that are element-wise identical to the positive vector
+    neg_is_pos = (quantized_features == quantized_negatives).all(-1)
+    # the positive candidate itself is never flagged, hence the leading all-False slice
+    neg_is_pos = jnp.concatenate([jnp.full((1,) + loss_logits.shape[1:], False), neg_is_pos], axis=0)
+    # make sure incorrectly sampled vectors don't contribute to loss
+    loss_logits = jnp.where(neg_is_pos, -1e9, loss_logits)
+    # one row of candidate logits per position, candidates on the last axis
+    predictions = loss_logits.transpose(2, 1, 0).reshape(-1, loss_logits.shape[0])
+    # positions with mask == 1 get target class 0 (the positive); all others get -100
+    targets = ((1 - mask_time_indices) * -100).transpose(1, 0).flatten()
+    # zero out the loss of positions whose target is negative
+    target_mask = jnp.where(targets >= 0, 1.0, 0.0)
+    contrastive_loss = optax.softmax_cross_entropy(predictions, onehot(targets, predictions.shape[-1])) * target_mask
+    contrastive_loss = contrastive_loss.sum()
+    return contrastive_loss
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    configure_logger(model_args, training_args)
+    # Downloading and loading a dataset from the hub.
+    if data_args.dataset_name:
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            # make sure only "validation" and "train" keys remain"
+            datasets = DatasetDict()
+            datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"{data_args.train_split_name}[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"{data_args.train_split_name}[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+        else:
+            # make sure only "validation" and "train" keys remain"
+            datasets = DatasetDict()
+            datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split="validation",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"{data_args.train_split_name}",
+                cache_dir=model_args.cache_dir,
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        datasets = load_dataset(extension, data_files=data_files, delimiter="\t")
+    # only normalized-inputs-training is supported
+    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        do_normalize=True
+    )
+    def prepare_dataset(batch):
+        # check that all files have the correct sampling rate
+        batch["speech"], _ = librosa.load(batch[data_args.speech_file_column], sr=feature_extractor.sampling_rate)
+        return batch
+    # load audio files into numpy arrays
+    vectorized_datasets = datasets.map(
+        prepare_dataset,
+        num_proc=data_args.preprocessing_num_workers,
+        remove_columns=datasets["train"].column_names
+    )
+    # filter audio files that are too long
+    vectorized_datasets = vectorized_datasets.filter(
+        lambda data: len(data["speech"]) < int(data_args.max_duration_in_seconds * feature_extractor.sampling_rate)
+    )
+    def normalize(batch):
+        return feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate)
+    # normalize and transform to `BatchFeatures`
+    vectorized_datasets = vectorized_datasets.map(
+        normalize,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        load_from_cache_file=not data_args.overwrite_cache,
+        remove_columns=vectorized_datasets["train"].column_names,
+    )
+    # pretraining is only supported for "newer" stable layer norm architecture
+    # apply_spec_augment has to be True, mask_feature_prob has to be 0.0
+    config = Wav2Vec2Config.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        gradient_checkpointing=model_args.gradient_checkpointing,
+    )
+    if not config.do_stable_layer_norm or config.feat_extract_norm != "layer":
+        raise ValueError(
+            "PreTraining is only supported for ``config.do_stable_layer_norm=True`` and ``config.feat_extract_norm='layer'"
+        )
+    model = FlaxWav2Vec2ForPreTraining(
+        config,
+        seed=training_args.seed,
+        dtype=getattr(jnp, model_args.dtype)
+    )
+    data_collator = FlaxDataCollatorForWav2Vec2Pretraining(
+        model=model,
+        feature_extractor=feature_extractor,
+        pad_to_multiple_of=data_args.pad_to_multiple_of
+    )
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            from flax.metrics.tensorboard import SummaryWriter
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )
+    # Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+    gumbel_rngs = jax.random.split(rng, jax.local_device_count())
+    num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+    num_train_steps = len(vectorized_datasets["train"]) // train_batch_size * num_epochs
+    # Create learning rate schedule
+    warmup_fn = optax.linear_schedule(
+        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
+    )
+    decay_fn = optax.linear_schedule(
+        init_value=training_args.learning_rate,
+        end_value=0,
+        transition_steps=num_train_steps - training_args.warmup_steps,
+    )
+    linear_decay_lr_schedule_fn = optax.join_schedules(
+        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
+    )
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {
+            path: (path[-1] != "bias" and path[-2:] not in [("layer_norm", "scale"), ("final_layer_norm", "scale")])
+            for path in flat_params
+        }
+        return traverse_util.unflatten_dict(flat_mask)
+    # create adam optimizer
+    adamw = optax.adamw(
+        learning_rate=linear_decay_lr_schedule_fn,
+        b1=training_args.adam_beta1,
+        b2=training_args.adam_beta2,
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+        mask=decay_mask_fn,
+    )
+    # Setup train state and define training hyper-parameters
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)
+    num_negatives = model.config.num_negatives
+    contrastive_logits_temperature = model.config.contrastive_logits_temperature
+    num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups
+    diversity_loss_weight = model.config.diversity_loss_weight
+    # Define gradient update step fn
+    def train_step(state, batch, dropout_rng, gumbel_rng):
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+        gumbel_rng, new_gumbel_rng = jax.random.split(gumbel_rng)
+        def loss_fn(params):
+            negative_indices = batch.pop("sampled_negative_indices")
+            gumbel_temperature = jnp.clip(
+                model_args.max_gumbel_temperature * model_args.gumbel_temperature_decay ** state.step,
+                a_min=model_args.min_gumbel_temperature,
+            )
+            outputs = state.apply_fn(
+                **batch,
+                gumbel_temperature=gumbel_temperature,
+                params=params,
+                dropout_rng=dropout_rng,
+                gumbel_rng=gumbel_rng,
+                train=True,
+            )
+            contrastive_loss = compute_contrastive_loss(
+                outputs.projected_quantized_states,
+                outputs.projected_states,
+                negative_indices,
+                batch["mask_time_indices"],
+                contrastive_logits_temperature,
+                num_negatives,
+            )
+            diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors
+            loss = contrastive_loss + diversity_loss_weight * diversity_loss
+            return loss
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+        metrics = jax.lax.pmean(
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
+        )
+        return new_state, metrics, new_dropout_rng, new_gumbel_rng
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+    # Define eval fn
+    def eval_step(params, batch):
+        negative_indices = batch.pop("sampled_negative_indices")
+        outputs = model(**batch, params=params, train=False)
+        contrastive_loss = compute_contrastive_loss(
+            outputs.projected_quantized_states,
+            outputs.projected_states,
+            negative_indices,
+            batch["mask_time_indices"],
+            contrastive_logits_temperature,
+            num_negatives,
+        )
+        diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors
+        loss = contrastive_loss + diversity_loss_weight * diversity_loss
+        # summarize metrics
+        metrics = {"loss": loss.mean(), "codevector_perplexity": outputs.codevector_perplexity}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+        return metrics
+    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    train_time = 0
+    train_metrics = []
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+        train_start = time.time()
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+        # Generate an epoch by shuffling sampling indices from the train dataset
+        num_train_samples = len(vectorized_datasets["train"])
+        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
+        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+        # Gather the indexes for creating the batch and do a training step
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+            samples = [vectorized_datasets["train"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples)
+            model_inputs = shard(model_inputs.data)
+            # Model forward
+            state, train_metric, dropout_rngs, gumbel_rngs = p_train_step(
+                state, model_inputs, dropout_rngs, gumbel_rngs
+            )
+            train_metrics.append(train_metric)
+            cur_step = epoch * (num_train_samples // train_batch_size) + step
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
+                )
+                train_metrics = []
+        # ======================== Evaluating ==============================
+        num_eval_samples = len(vectorized_datasets["validation"])
+        eval_samples_idx = jnp.arange(num_eval_samples)
+        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+        eval_metrics = []
+        for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+            samples = [vectorized_datasets["validation"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            metrics = p_eval_step(state.params, model_inputs)
+            eval_metrics.append(metrics)
+        # get eval metrics
+        eval_metrics = get_metrics(eval_metrics)
+        eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
+        # Update progress bar
+        epochs.write(
+            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {eval_metrics['loss']}, Perplexity: {eval_metrics['codevector_perplexity']})"
+        )
+        # Save metrics
+        if has_tensorboard and jax.process_index() == 0:
+            cur_step = epoch * (len(vectorized_datasets["train"]) // train_batch_size)
+            write_eval_metric(summary_writer, eval_metrics, cur_step)
+        # save checkpoint after each epoch and push checkpoint to the hub
+        if jax.process_index() == 0:
+            params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+            model.save_pretrained(
+                training_args.output_dir,
+                params=params,
+                push_to_hub=training_args.push_to_hub
+            )
+# Run the pre-training entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()