{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# !mkdir -p /home/m3hrdadfi/code/data\n", "# %cd /home/m3hrdadfi/code/data\n", "# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n", "# %cd /home/m3hrdadfi/" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/m3hrdadfi/data/fa/cvfa/fa\n", "/home/m3hrdadfi/data/fa\n", "\n", "cvfa fa.tar.gz\n", "/home/m3hrdadfi/data/fa/cvfa/fa/dev.tsv\n", "/home/m3hrdadfi/data/fa/cvfa/fa/invalidated.tsv\n", "/home/m3hrdadfi/data/fa/cvfa/fa/other.tsv\n", "/home/m3hrdadfi/data/fa/cvfa/fa/reported.tsv\n", "/home/m3hrdadfi/data/fa/cvfa/fa/test.tsv\n", "/home/m3hrdadfi/data/fa/cvfa/fa/train.tsv\n", "/home/m3hrdadfi/data/fa/cvfa/fa/validated.tsv\n" ] } ], "source": [ "import os\n", "\n", "lang = \"fa\"\n", "abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/data/{lang}\", f\"cv{lang}\", lang)\n", "save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n", "print(abs_path_to_data)\n", "print(save_path)\n", "print()\n", "!ls {save_path}\n", "!ls {abs_path_to_data}/*.tsv" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Step 0: 5213\n", "Step 1: 5213\n", "Step 2: 5213\n", "Step 3: 5213\n" ] }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath
0از مهمونداری کنار بکشم/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1برو از مهرداد بپرس./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2خب ، تو چیكار می كنی؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3مسقط پایتخت عمان در عربی به معنای محل سقوط است/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4آه، نه اصلاُ!/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", "text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n", "\n", "print(f\"Step 0: {len(test_df)}\")\n", "\n", "# Keep only rows whose clip exists on disk. The old code stored the check in a\n", "# \"status\" column but ran dropna() on \"path\", so missing clips were never removed.\n", "test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n", "clip_exists = test_df[\"path\"].apply(lambda path: isinstance(path, str) and os.path.exists(path))\n", "test_df = test_df[clip_exists.to_numpy()]\n", "print(f\"Step 1: {len(test_df)}\")\n", "\n", "test_df = test_df.dropna(subset=[\"sentence\"])\n", "print(f\"Step 2: {len(test_df)}\")\n", "\n", "test_df = test_df[[\"sentence\", \"path\"]]\n", "test_df = test_df.drop_duplicates(subset=\"path\")\n", "print(f\"Step 3: {len(test_df)}\")\n", "\n", "test_df = test_df.reset_index(drop=True)\n", "test_df.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "12806\n", "Step 0: 286975\n", "Step 1: 286975\n", "Step 2: 286975\n", "Step 3: 274169\n" ] }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "_train_df = pd.concat([\n", "    pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", "    pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", "])\n", "print(len(_train_df))\n", "\n", "train_df = pd.concat([\n", "    pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", "    pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", "    pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n", "    pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n", "])\n", "print(f\"Step 0: {len(train_df)}\")\n", "\n", "# Keep only rows whose clip exists on disk. The old code stored the check in a\n", "# \"status\" column but ran dropna() on \"path\", so missing clips were never removed.\n", "# The mask is applied positionally because the concatenated index contains duplicates.\n", "train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n", "clip_exists = train_df[\"path\"].apply(lambda path: isinstance(path, str) and os.path.exists(path))\n", "train_df = train_df[clip_exists.to_numpy()]\n", "print(f\"Step 1: {len(train_df)}\")\n", "\n", "train_df = train_df.dropna(subset=[\"sentence\"])\n", "print(f\"Step 2: {len(train_df)}\")\n", "\n", "train_df = train_df[[\"sentence\", \"path\"]]\n", "train_df = train_df.drop_duplicates(subset=\"path\")\n", "print(f\"Step 3: {len(train_df)}\")\n", "\n", "train_df = train_df.sample(frac=1)\n", "train_df = train_df.reset_index(drop=True)\n", "train_df.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 5213/5213 [02:58<00:00, 29.27it/s]\n"
] }, { "name": "stdout", "output_type": "stream", "text": [ "Found #5213 test data\n" ] } ], "source": [ "from tqdm import tqdm\n", "\n", "# Index labels of the train_df rows whose clip also appears in the test split.\n", "# A single vectorised isin() lookup replaces one full scan of train_df per test row.\n", "is_test_clip = train_df[\"path\"].isin(test_df[\"path\"])\n", "testset_indices = train_df.index[is_test_clip].tolist()\n", "print(f\"Found #{len(testset_indices)} test data\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "274169\n", "268956\n" ] } ], "source": [ "print(len(train_df))\n", "train_df = train_df.drop(testset_indices)\n", "print(len(train_df))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 274169 entries, 0 to 5212\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 274169 non-null object\n", " 1 path 274169 non-null object\n", "dtypes: object(2)\n", "memory usage: 6.3+ MB\n", "None\n", "\n", "RangeIndex: 274169 entries, 0 to 274168\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 274169 non-null object\n", " 1 path 274169 non-null object\n", "dtypes: object(2)\n", "memory usage: 4.2+ MB\n", "None\n" ] }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.concat([train_df, test_df], axis=0)\n", "print(df.info())\n", "df = df.reset_index(drop=True)\n", "print(df.info())\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "import torchaudio\n", "import librosa\n", "import IPython.display as ipd\n", "import numpy as np\n", "\n", "def load_audio(path, target_sr=16_000):\n", "    \"\"\"Load a clip, resample its first channel to `target_sr` Hz and play it inline.\n", "\n", "    Prints the resampled array's shape together with the clip's original sampling rate.\n", "    \"\"\"\n", "    speech, sr = torchaudio.load(path)\n", "    speech = speech[0].numpy().squeeze()\n", "    # Keyword arguments: librosa >= 0.10 made orig_sr/target_sr keyword-only,\n", "    # and older releases accept the same names.\n", "    speech = librosa.resample(np.asarray(speech), orig_sr=sr, target_sr=target_sr)\n", "\n", "    print(speech.shape, sr)\n", "\n", "    ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=target_sr))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# main_vocab = [\"ح\", \"چ\", \"ج\", \"ث\", \"ت\", \"پ\", \"ب\", \"آ\", \"ا\", \"ش\", \"س\", \"ژ\", \"ز\", \"ر\", \"ذ\", \"د\", \"خ\", \"ق\", \"ف\", \"غ\", \"ع\", \"ظ\", \"ط\", \"ض\", \"ص\", \"ی\", \"ه\", \"و\", \"ن\", \"م\", \"ل\", \"گ\", \"ک\"]\n", "# text = \" \".join(df[\"sentence\"].values.tolist())\n", "# vocab = list(sorted(set(text)))\n", "\n", "# for v in main_vocab:\n", "# if v not in vocab:\n", "# print(\"v\", v)\n", "\n", "# print(len(main_vocab), len(vocab))\n", "# print(len(vocab), vocab)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": "sentence می توانید لطفاً سفر را برای من ترتیب 
دهید؟\npath /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...\nName: 95177, dtype: object" }, "metadata": { "transient": {} }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "می توانید لطفاً سفر را برای من ترتیب دهید؟\n", "\n", "(70272,) 48000\n" ] }, { "data": { "text/html": "\n \n ", "text/plain": "" }, "metadata": { "transient": {} }, "output_type": "display_data" }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "\n", "idx = np.random.randint(0, len(df))\n", "# idx = 6140\n", "sample = df.iloc[idx]\n", "ipd.display(sample)\n", "\n", "print()\n", "print(sample[\"sentence\"])\n", "print()\n", "load_audio(sample[\"path\"])\n", "\n", "train_df.head()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 268956 entries, 0 to 274168\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 268956 non-null object\n", " 1 path 268956 non-null object\n", " 2 _path 268956 non-null object\n", "dtypes: object(3)\n", "memory usage: 8.2+ MB\n", "None\n", "/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_voice_fa_20100079.mp3\n", "/home/m3hrdadfi/data/fa/clips/common_voice_fa_20100079.mp3\n" ] }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_train_df = train_df.copy()\n", "new_train_df[\"_path\"] = new_train_df[\"path\"]\n", "# Flattened destination lives under save_path (the same root the later mkdir step uses)\n", "# instead of repeating a hard-coded absolute directory.\n", "clips_dir = os.path.join(save_path, \"clips\")\n", "new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(clips_dir, os.path.basename(t)))\n", "print(new_train_df.info())\n", "print(new_train_df.iloc[0][\"_path\"])\n", "print(new_train_df.iloc[0][\"path\"])\n", "new_train_df.head()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5213 entries, 0 to 5212\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 5213 non-null object\n", " 1 path 5213 non-null object\n", " 2 _path 5213 non-null object\n", "dtypes: object(3)\n", "memory usage: 122.3+ KB\n", "None\n", "/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_voice_fa_18325365.mp3\n", "/home/m3hrdadfi/data/fa/clips/common_voice_fa_18325365.mp3\n" ] }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0از مهمونداری کنار بکشم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1برو از مهرداد بپرس./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2خب ، تو چیكار می كنی؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3مسقط پایتخت عمان در عربی به معنای محل سقوط است/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4آه، نه اصلاُ!/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", "text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_test_df = test_df.copy()\n", "new_test_df[\"_path\"] = new_test_df[\"path\"]\n", "# Flattened destination lives under save_path (the same root the mkdir step below uses)\n", "# instead of repeating a hard-coded absolute directory.\n", "clips_dir = os.path.join(save_path, \"clips\")\n", "new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(clips_dir, os.path.basename(t)))\n", "print(new_test_df.info())\n", "print(new_test_df.iloc[0][\"_path\"])\n", "print(new_test_df.iloc[0][\"path\"])\n", "new_test_df.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "import shutil\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/m3hrdadfi/data/fa\n" ] } ], "source": [ "print(save_path)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "!mkdir -p {save_path}/clips\n", "!mkdir -p {save_path}/augs" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 268956/268956 [02:40<00:00, 1675.19it/s]\n" ] } ], "source": [ "# Copy every training clip from its original location (_path) to the flattened clips folder (path).\n", "for src_path, dst_path in tqdm(zip(new_train_df[\"_path\"], new_train_df[\"path\"]), position=0, total=len(new_train_df)):\n", "    shutil.copy(src_path, dst_path)" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 5213/5213 [00:01<00:00, 4777.79it/s]\n" ] } ], "source": [ "# Copy every test clip from its original location (_path) to the flattened clips folder (path).\n", "for src_path, dst_path in tqdm(zip(new_test_df[\"_path\"], new_test_df[\"path\"]), position=0, total=len(new_test_df)):\n", "    shutil.copy(src_path, dst_path)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 26896 entries, 0 to 26895\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 26896 non-null object\n", " 1 path 26896 non-null object\n", " 2 _path 26896 non-null object\n", "dtypes: object(3)\n", "memory usage: 630.5+ KB\n", "None\n" ] }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0کدامیک ارزان تر است؟/home/m3hrdadfi/data/fa/augs/common_voice_fa_2.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
1آیا قرمز را بیشتر از آبی دوست داری؟/home/m3hrdadfi/data/fa/augs/common_voice_fa_2.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
2من می خوام کمک کنم/home/m3hrdadfi/data/fa/augs/common_voice_fa_1.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
3در آفریقای جنوبی، برنامهای به نام دختران تکنو هست/home/m3hrdadfi/data/fa/augs/common_voice_fa_1.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
4حالا، این موضوع به ما فرصت ایجاد چند سناریو را.../home/m3hrdadfi/data/fa/augs/common_voice_fa_1.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
\n
", "text/plain": " sentence \\\n0 کدامیک ارزان تر است؟ \n1 آیا قرمز را بیشتر از آبی دوست داری؟ \n2 من می خوام کمک کنم \n3 در آفریقای جنوبی، برنامهای به نام دختران تکنو هست \n4 حالا، این موضوع به ما فرصت ایجاد چند سناریو را... \n\n path \\\n0 /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... \n1 /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... \n2 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n3 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n4 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n\n _path \n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... " }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# aug_train_df = new_train_df.copy()\n", "def to_aug_path(clip_path):\n", "    \"\"\"Map <root>/clips/<name>.mp3 to <root>/augs/<name>_aug.mp3.wav.\n", "\n", "    Built with os.path so that dots in directory names or a second 'clips'\n", "    component elsewhere in the path cannot corrupt the result.\n", "    \"\"\"\n", "    root = os.path.dirname(os.path.dirname(clip_path))\n", "    stem = os.path.splitext(os.path.basename(clip_path))[0]\n", "    return os.path.join(root, \"augs\", stem + \"_aug.mp3.wav\")\n", "\n", "aug_train_df = new_train_df.sample(frac=0.1)\n", "aug_train_df = aug_train_df.reset_index(drop=True)\n", "aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n", "aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(to_aug_path)\n", "print(aug_train_df.info())\n", "aug_train_df.head()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/m3hrdadfi/data/fa/clips/common_voice_fa_20109281.mp3\n", "/home/m3hrdadfi/data/fa/augs/common_voice_fa_20109281_aug.mp3.wav\n" ] } ], "source": [ "print(aug_train_df.iloc[0][\"_path\"])\n", "print(aug_train_df.iloc[0][\"path\"])" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# augmentation\n", "\n", "from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n", "import numpy as np\n", "import soundfile as sf\n", "\n", "augment = Compose([\n", "# AddGaussianNoise(min_amplitude=0.001, 
max_amplitude=0.015, p=0.5),\n", "# PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n", "# Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n", "    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n", "    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n", "    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n", "])\n", "\n", "def augmented_speech_file_to_array_fn(in_path, out_path):\n", "    \"\"\"Read `in_path`, apply the random `augment` pipeline and write the result to `out_path`.\n", "\n", "    The output keeps the input's sampling rate and is written with the PCM_24 subtype.\n", "    \"\"\"\n", "    speech_array, sampling_rate = torchaudio.load(in_path)\n", "    speech_array = speech_array.squeeze().numpy()\n", "    speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n", "    # torchaudio yields (channels, frames) while soundfile expects (frames, channels),\n", "    # so multi-channel clips must be transposed; mono clips are already 1-D.\n", "    if speech_array.ndim > 1:\n", "        speech_array = speech_array.T\n", "    sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 26896/26896 [1:18:09<00:00, 5.74it/s]\n" ] } ], "source": [ "for src_path, aug_path in tqdm(zip(aug_train_df[\"_path\"], aug_train_df[\"path\"]), position=0, total=len(aug_train_df)):\n", "    augmented_speech_file_to_array_fn(src_path, aug_path)\n", "# !ls" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 295852 entries, 0 to 295851\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 295852 non-null object\n", " 1 path 295852 non-null object\n", " 2 _path 295852 non-null object\n", "dtypes: object(3)\n", "memory usage: 6.8+ MB\n", "None\n" ] }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0اما دیدم نه،هیچ جوره نمیتونم ببخشمش به خدا گفت.../home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1برای امروز./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2چون اگر میدانیم چیزی که بیگناه در نظر میگیریم .../home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3ضمیر من را بدانید -- آقا، خانم، ایشان/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4تا تقویت و تکثیرشان کنیم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", "text/plain": " sentence \\\n0 اما دیدم نه،هیچ جوره نمیتونم ببخشمش به خدا گفت... \n1 برای امروز. \n2 چون اگر میدانیم چیزی که بیگناه در نظر میگیریم ... \n3 ضمیر من را بدانید -- آقا، خانم، ایشان \n4 تا تقویت و تکثیرشان کنیم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n", "# new_train_aug_df = new_train_df.copy()\n", "new_train_aug_df = new_train_aug_df.sample(frac=1)\n", "new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n", "print(new_train_aug_df.info())\n", "new_train_aug_df.head()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": "'/home/m3hrdadfi/data/fa'" }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "save_path" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", "new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", "new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": "sentence 268956\npath 268956\n_path 268956\ndtype: int64" }, 
"execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_train_df.count()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": "sentence 5213\npath 5213\n_path 5213\ndtype: int64" }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_test_df.count()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "import os\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 268956 entries, 0 to 268955\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 268956 non-null object\n", " 1 path 268956 non-null object\n", " 2 _path 268956 non-null object\n", "dtypes: object(3)\n", "memory usage: 6.2+ MB\n", "None\n" ] }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n", "print(train_df.info())\n", "train_df.head()" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5213 entries, 0 to 5212\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 5213 non-null object\n", " 1 path 5213 non-null object\n", " 2 _path 5213 non-null object\n", "dtypes: object(3)\n", "memory usage: 122.3+ KB\n", "None\n" ] }, { "data": { "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0از مهمونداری کنار بکشم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1برو از مهرداد بپرس./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2خب ، تو چیكار می كنی؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3مسقط پایتخت عمان در عربی به معنای محل سقوط است/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4آه، نه اصلاُ!/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", "text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n", "print(test_df.info())\n", "test_df.head()" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 268956/268956 [00:11<00:00, 24344.12it/s]\n" ] } ], "source": [ "# Index labels of every training row whose copied clip is missing on disk.\n", "# The previous loop called list.extends() (no such method) on list(index) (an int is\n", "# not iterable), so it would have crashed on the first missing file, and its `break`\n", "# stopped the scan after one hit. All missing rows are collected now.\n", "non_existed_train = [\n", "    index\n", "    for index, path in tqdm(zip(train_df.index, train_df[\"path\"]), total=len(train_df), position=0)\n", "    if not os.path.exists(path)\n", "]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": "[]" }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "non_existed_train" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "\n", "\n", "# idx = np.random.randint(0, len(train_df))\n", "# # idx = 6140\n", "# sample = train_df.iloc[idx]\n", "# ipd.display(sample)\n", "# # print(sample.iloc[idx][\"prev_sentence\"])\n", "# print()\n", "# print(sample[\"prev_sentence\"])\n", "# 
print(sample[\"sentence\"])\n", "# print()\n", "# load_audio(sample[\"path\"])" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# train_df_half = train_df.copy()\n", "# print(train_df_half.shape)\n", "# train_df_half = train_df_half.dropna()\n", "# print(train_df_half.shape)\n", "# train_df_half = train_df_half.drop_duplicates()\n", "# print(train_df_half.shape)\n", "\n", "# train_df_half = train_df_half.sample(frac=0.5)\n", "# train_df_half = train_df_half.reset_index(drop=True)\n", "# print(train_df_half.shape)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "# train_df_half.to_csv(f\"{save_path}/train_no_aug_half.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.10 ('jax-env': venv)", "metadata": { "interpreter": { "hash": "d26705e03f37deada2a9ba7d9c91760e1381e108d31e47ed80b202768ffcaf62" } }, "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "orig_nbformat": 2 }, "nbformat": 4, "nbformat_minor": 2 }