diff --git "a/check_sound.ipynb" "b/check_sound.ipynb" new file mode 100644--- /dev/null +++ "b/check_sound.ipynb" @@ -0,0 +1,474 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "9fef90b6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "################################################################################\n", + "### WARNING, path does not exist: KALDI_ROOT=/mnt/matylda5/iveselyk/Tools/kaldi-trunk\n", + "### (please add 'export KALDI_ROOT=' in your $HOME/.profile)\n", + "### (or run as: KALDI_ROOT= python .py)\n", + "################################################################################\n", + "\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "from datasets import load_dataset\n", + "import torchaudio\n", + "import IPython.display as ipd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cb52f737", + "metadata": {}, + "outputs": [], + "source": [ + "def show_wave_from_array(speech_array):\n", + "    \"\"\"Plot the samples in speech_array on a new figure with the y-axis fixed to [-1, 1].\"\"\"\n", + "    plt.figure()\n", + "    plt.ylim([-1.0,1.0])\n", + "    plt.plot(speech_array)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c6bacb26", + "metadata": {}, + "outputs": [], + "source": [ + "def show_wave(filename, gain=1.0, sampling_rate=0):\n", + "    \"\"\"Load an audio file, apply gain, resample to sampling_rate when it is > 0, and plot the waveform.\"\"\"\n", + "    speech_array, sampling_rate_original = torchaudio.load(filename)\n", + "    # Apply the gain on every path; it used to be silently ignored unless resampling was requested.\n", + "    audio = torchaudio.transforms.Vol(gain)(speech_array)\n", + "    if sampling_rate > 0:\n", + "        audio = torchaudio.transforms.Resample(sampling_rate_original, sampling_rate)(audio)\n", + "    # Drop size-1 dimensions and convert to NumPy before printing the length and plotting.\n", + "    audio = audio.squeeze().numpy()\n", + "    print(sampling_rate_original, len(audio))\n", + "    show_wave_from_array(audio)" + ] + }, + { + "cell_type": "markdown", + "id": "f321570d", + "metadata": {}, + "source": [ + "## Common Voice 7 - Turkish" + ] + }, 
+ { + "cell_type": "markdown", + "id": "7ac2553b", + "metadata": {}, + "source": [ + "### We read audio data directly from test set with index 2252" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3ab2bd46", + "metadata": {}, + "outputs": [], + "source": [ + "audio_index = 2252" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "209a06a5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Reusing dataset common_voice (/home/cahya/.cache/huggingface/datasets/mozilla-foundation___common_voice/tr/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "85382657a67a4e489c1521df860cd0d0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/5 [00:00" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "show_wave_from_array(audio)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f017f96e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ipd.Audio(data=audio, autoplay=True, rate=sampling_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09fb172e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "6aee86bb", + "metadata": {}, + "source": [ + "### We read other audio data before we read test set with index 2252. 
This is how we normally do it" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f2033087", + "metadata": {}, + "outputs": [], + "source": [ + "audio_index = 2252" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3ffaae25", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Reusing dataset common_voice (/home/cahya/.cache/huggingface/datasets/mozilla-foundation___common_voice/tr/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7e48d5fcc8af4d5d8c7ccbbf7ca81d46", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/5 [00:00" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "show_wave_from_array(audio)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9d7f99b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ipd.Audio(data=audio, autoplay=True, rate=sampling_rate)" + ] + }, + { + "cell_type": "markdown", + "id": "b6377b00", + "metadata": {}, + "source": [ + "The following audio clips are also affected: 2253, ..., which originally have a sampling rate of 32 kHz. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "dec6985a", + "metadata": {}, + "outputs": [], + "source": [ + "audio_index = 2253" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "fa863918", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "path: cv-corpus-7.0-2021-07-21/tr/clips/common_voice_tr_24151645.mp3, sampling rate: 48000, array's length: 219429\n" + ] + } + ], + "source": [ + "audio, sampling_rate = ds_tr_7['test'][audio_index]['audio']['array'], ds_tr_7['test'][audio_index]['audio']['sampling_rate']\n", + "print(f\"path: {ds_tr_7['test'][audio_index]['audio']['path']}, sampling rate: {sampling_rate}, array's length: {len(audio)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "01ecb2fd", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "show_wave_from_array(audio)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a891f06f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ipd.Audio(data=audio, autoplay=True, rate=sampling_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90dadef0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}