{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook computes the average SNR of a given voice dataset. If the SNR is too low, that might reduce the performance or prevent the model from learning. The SNR estimation paper can be found here: https://www.cs.cmu.edu/~robust/Papers/KimSternIS08.pdf\n",
    "\n",
    "To use this notebook, you need:\n",
    "- WADA SNR estimation: http://www.cs.cmu.edu/~robust/archive/algorithms/WADA_SNR_IS_2008/\n",
    "    1. extract in the same folder as this notebook\n",
    "    2. under MacOS you'll have to rebuild the executable. In the build folder: 1) remove existing .o files and 2) run make\n",
    "\n",
    "\n",
    "- FFMPEG: ```sudo apt-get install ffmpeg ``` \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "import shutil\n",
    "import subprocess\n",
    "import tempfile\n",
    "import IPython\n",
    "import soundfile as sf\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from multiprocessing import Pool\n",
    "from matplotlib import pylab as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the meta parameters\n",
    "DATA_PATH = \"/home/erogol/Data/m-ai-labs/de_DE/by_book/female/eva_k/\"\n",
    "NUM_PROC = 1\n",
    "CURRENT_PATH = os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def compute_file_snr(file_path):\n",
    "    \"\"\"Convert given file to required format with FFMPEG and process with WADA.\n",
    "\n",
    "    The file is copied (when it is already 16 kHz) or converted with FFMPEG to a\n",
    "    16 kHz mono 16-bit PCM wav inside a private temporary directory, so nothing is\n",
    "    written next to the dataset files and no temporary file survives a failure.\n",
    "\n",
    "    Args:\n",
    "        file_path (str): path of the audio file to analyse.\n",
    "\n",
    "    Returns:\n",
    "        tuple: (snr, file_path), snr being the float parsed from the WADASNR output.\n",
    "\n",
    "    Raises:\n",
    "        subprocess.CalledProcessError: if FFMPEG or WADASNR exits with a non-zero code.\n",
    "        RuntimeError: if the WADASNR output can not be parsed; the message is the\n",
    "            WADASNR command line.\n",
    "    \"\"\"\n",
    "    wada_dir = os.path.join(CURRENT_PATH, \"WadaSNR\", \"Exe\")\n",
    "    # only the header is needed to learn the sampling rate, no need to decode the audio\n",
    "    sr = sf.info(file_path).samplerate\n",
    "    # the context manager removes the temporary copy even when a step below raises\n",
    "    with tempfile.TemporaryDirectory() as tmp_dir:\n",
    "        new_file = os.path.join(tmp_dir, \"wada_input.wav\")\n",
    "        if sr != 16000:\n",
    "            # argument lists (no shell) keep paths with quotes or spaces intact\n",
    "            ffmpeg_cmd = [\"ffmpeg\", \"-i\", file_path, \"-ac\", \"1\", \"-acodec\", \"pcm_s16le\", \"-y\", \"-ar\", \"16000\", new_file]\n",
    "            subprocess.run(ffmpeg_cmd, check=True)\n",
    "        else:\n",
    "            shutil.copyfile(file_path, new_file)\n",
    "        command = [\n",
    "            os.path.join(wada_dir, \"WADASNR\"),\n",
    "            \"-i\", new_file,\n",
    "            \"-t\", os.path.join(wada_dir, \"Alpha0.400000.txt\"),\n",
    "            \"-ifmt\", \"mswav\",\n",
    "        ]\n",
    "        output = subprocess.check_output(command)\n",
    "    try:\n",
    "        # the SNR value is the third token from the end of the WADASNR output\n",
    "        snr = float(output.split()[-3].decode(\"utf-8\"))\n",
    "    except (IndexError, ValueError) as err:\n",
    "        raise RuntimeError(\" \".join(command)) from err\n",
    "    return snr, file_path\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wav_file = \"/home/erogol/Data/LJSpeech-1.1/wavs/LJ001-0001.wav\"\n",
    "output = compute_file_snr(wav_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wav_files = glob.glob(f\"{DATA_PATH}/**/*.wav\", recursive=True)\n",
    "print(f\" > Number of wav files {len(wav_files)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if NUM_PROC == 1:\n",
    "    # iterating the list itself lets tqdm know the total and show an ETA\n",
    "    file_snrs = [compute_file_snr(wav_file) for wav_file in tqdm(wav_files)]\n",
    "else:\n",
    "    with Pool(NUM_PROC) as pool:\n",
    "        file_snrs = list(tqdm(pool.imap(compute_file_snr, wav_files), total=len(wav_files)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# files whose SNR came back as NaN are set aside; `file_snrs` is left untouched\n",
    "# so that this cell can be re-run without losing `error_files`\n",
    "is_error = np.isnan(np.array([tup[0] for tup in file_snrs], dtype=float))\n",
    "error_files = [tup[1] for tup, bad in zip(file_snrs, is_error) if bad]\n",
    "\n",
    "valid_file_snrs = [tup for tup, bad in zip(file_snrs, is_error) if not bad]\n",
    "file_names = [tup[1] for tup in valid_file_snrs]\n",
    "snrs = [tup[0] for tup in valid_file_snrs]\n",
    "file_idxs = np.argsort(snrs)\n",
    "\n",
    "\n",
    "print(f\" > Number of files with NaN SNR: {len(error_files)}\")\n",
    "print(f\" > Average SNR of the dataset:{np.mean(snrs)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def output_snr_with_audio(idx):\n",
    "    \"\"\"Print the SNR of the idx-th file in ascending SNR order and display an audio player for it.\n",
    "\n",
    "    Negative values of idx count from the highest SNR.\n",
    "    \"\"\"\n",
    "    file_idx = file_idxs[idx]\n",
    "    file_name = file_names[file_idx]\n",
    "    wav, sr = sf.read(file_name)\n",
    "    # multi channel to single channel\n",
    "    if len(wav.shape) == 2:\n",
    "        wav = wav[:, 0]\n",
    "    print(f\" > {file_name} - snr:{snrs[file_idx]}\")\n",
    "    IPython.display.display(IPython.display.Audio(wav, rate=sr))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find worst SNR files\n",
    "N = 10  # number of files to fetch\n",
    "# never ask for more files than there are valid SNR values\n",
    "for i in range(min(N, len(file_idxs))):\n",
    "    output_snr_with_audio(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find best recordings\n",
    "N = 10  # number of files to fetch\n",
    "# never ask for more files than there are valid SNR values\n",
    "for i in range(min(N, len(file_idxs))):\n",
    "    output_snr_with_audio(-i-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.hist(snrs, bins=100)\n",
    "plt.xlabel(\"SNR\")\n",
    "plt.ylabel(\"Number of files\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}