{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "Collapsed": "false"
   },
   "source": [
    "# TTS dataset analysis\n",
    "\n",
    "Sanity-check a TTS dataset before training: missing or duplicated wav files, vocabulary size, and how audio duration relates to text length.\n",
    "\n",
    "Set `DATASET_CONFIG` in the configuration cell below, then run all cells from top to bottom."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from collections import Counter\n",
    "from multiprocessing import Pool\n",
    "\n",
    "import librosa\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from matplotlib import pylab as plt\n",
    "from scipy.stats import norm\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "from TTS.config.shared_configs import BaseDatasetConfig\n",
    "from TTS.tts.datasets import load_tts_samples\n",
    "from TTS.tts.datasets.formatters import *\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# number of worker processes used to load the audio files (<= 1 loads serially)\n",
    "NUM_PROC = 8\n",
    "# clips shorter than this many seconds are listed for manual review\n",
    "MIN_AUDIO_SEC = 2\n",
    "# a clip is reported as irregular when its seconds-per-character value lies in a\n",
    "# tail of the fitted normal distribution that holds less than this probability\n",
    "OUTLIER_TAIL_PROB = 0.001\n",
    "DATASET_CONFIG = BaseDatasetConfig(\n",
    "    formatter=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/absolute/path/to/your/dataset/\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument\n",
    "    \"\"\"Read a pipe-separated metadata file into a list of sample dicts.\n",
    "\n",
    "    Each non-blank line must look like `<wav name>|<text>[|...]`. The wav path is\n",
    "    built as `<root_path>/wavs/<wav name>.wav` and every sample gets the speaker\n",
    "    name \"myspeaker\".\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a non-blank line has fewer than two `|`-separated columns.\n",
    "    \"\"\"\n",
    "    txt_file = os.path.join(root_path, meta_file)\n",
    "    items = []\n",
    "    speaker_name = \"myspeaker\"\n",
    "    with open(txt_file, \"r\", encoding=\"utf-8\") as ttf:\n",
    "        for line_no, line in enumerate(ttf, start=1):\n",
    "            if not line.strip():\n",
    "                # tolerate blank lines (e.g. a trailing newline at the end of the file)\n",
    "                continue\n",
    "            cols = line.split(\"|\")\n",
    "            if len(cols) < 2:\n",
    "                raise ValueError(\"{}:{}: expected at least 2 '|'-separated columns\".format(txt_file, line_no))\n",
    "            wav_file = os.path.join(root_path, \"wavs\", cols[0] + \".wav\")\n",
    "            text = cols[1]\n",
    "            items.append({\"text\": text, \"audio_file\": wav_file, \"speaker_name\": speaker_name, \"root_path\": root_path})\n",
    "    return items"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# pass your own formatter at this stage; the built-in ones are imported from TTS.tts.datasets.formatters\n",
    "train_samples, eval_samples = load_tts_samples(DATASET_CONFIG, eval_split=True, formatter=formatter)\n",
    "if eval_samples is not None:\n",
    "    items = train_samples + eval_samples\n",
    "else:\n",
    "    items = train_samples\n",
    "print(\" > Number of audio files: {}\".format(len(items)))\n",
    "print(items[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# check that every wav file exists; missing paths are printed\n",
    "wav_files = []\n",
    "for item in items:\n",
    "    wav_file = item[\"audio_file\"].strip()\n",
    "    wav_files.append(wav_file)\n",
    "    if not os.path.exists(wav_file):\n",
    "        print(wav_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# show duplicate items\n",
    "wav_counts = Counter(wav_files)\n",
    "print([wav_file for wav_file, count in wav_counts.items() if count > 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "def load_item(item):\n",
    "    \"\"\"Load one sample and return (file_name, text, text_len, audio, audio_len).\n",
    "\n",
    "    `text_len` is the character count of the stripped text and `audio_len` is the\n",
    "    clip duration in seconds (number of samples divided by the sample rate that\n",
    "    librosa reports for the file).\n",
    "    \"\"\"\n",
    "    text = item[\"text\"].strip()\n",
    "    file_name = item[\"audio_file\"].strip()\n",
    "    audio, sr = librosa.load(file_name, sr=None)\n",
    "    audio_len = len(audio) / sr\n",
    "    text_len = len(text)\n",
    "    # NOTE: the cells below only read the text (index 1) and the duration (index -1);\n",
    "    # the raw audio stays in the tuple so its layout is unchanged.\n",
    "    return file_name, text, text_len, audio, audio_len\n",
    "\n",
    "\n",
    "# This will take a while depending on size of dataset\n",
    "if NUM_PROC <= 1:\n",
    "    data = [load_item(m) for m in tqdm(items)]\n",
    "else:\n",
    "    # honour NUM_PROC instead of a hard-coded worker count\n",
    "    with Pool(NUM_PROC) as p:\n",
    "        data = list(tqdm(p.imap(load_item, items), total=len(items)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# count words in the dataset\n",
    "w_count = Counter()\n",
    "for item in tqdm(data):\n",
    "    text = item[1].lower().strip()\n",
    "    w_count.update(text.split())\n",
    "print(\" > Number of words: {}\".format(len(w_count)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "text_vs_durs = {}  # text length vs audio duration\n",
    "text_len_counter = Counter()  # number of sentences with the keyed length\n",
    "for item in tqdm(data):\n",
    "    text = item[1].lower().strip()\n",
    "    text_len = len(text)\n",
    "    text_len_counter[text_len] += 1\n",
    "    audio_len = item[-1]\n",
    "    text_vs_durs.setdefault(text_len, []).append(audio_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# text_len vs avg_audio_len, median_audio_len, std_audio_len\n",
    "text_vs_avg = {}\n",
    "text_vs_median = {}\n",
    "text_vs_std = {}\n",
    "for key, durs in text_vs_durs.items():\n",
    "    text_vs_avg[key] = np.mean(durs)\n",
    "    text_vs_median[key] = np.median(durs)\n",
    "    text_vs_std[key] = np.std(durs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "Collapsed": "false"
   },
   "source": [
    "### Avg audio length per char"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# list very short clips: file name, duration in seconds, text\n",
    "for item in data:\n",
    "    if item[-1] < MIN_AUDIO_SEC:\n",
    "        print(item[0], item[-1], item[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "sec_per_chars = []\n",
    "for item in data:\n",
    "    text = item[1]\n",
    "    dur = item[-1]\n",
    "    if not text:\n",
    "        # an empty transcript would divide by zero; report it and leave it out of the statistics\n",
    "        print(\" > Skipping item with empty text: {}\".format(item[0]))\n",
    "        continue\n",
    "    sec_per_chars.append(dur / len(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "mean = np.mean(sec_per_chars)\n",
    "std = np.std(sec_per_chars)\n",
    "print(mean)\n",
    "print(std)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# normal distribution fitted to this dataset's seconds-per-character values\n",
    "dist = norm(mean, std)\n",
    "\n",
    "# find irregular instances: unusually long or short voice durations for the amount of text\n",
    "for item in data:\n",
    "    text = item[1]\n",
    "    dur = item[-1]\n",
    "    if not text:\n",
    "        continue  # already reported above; no per-character duration can be computed\n",
    "    sec_per_char = dur / len(text)\n",
    "    # probability mass of the fitted distribution lying beyond this value, on whichever side is smaller\n",
    "    tail_prob = min(dist.cdf(sec_per_char), dist.sf(sec_per_char))\n",
    "    if tail_prob < OUTLIER_TAIL_PROB:\n",
    "        print(item[0], sec_per_char, text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "Collapsed": "false"
   },
   "source": [
    "### Plot Dataset Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "plt.title(\"text length vs mean audio duration\")\n",
    "plt.xlabel(\"text length (chars)\")\n",
    "plt.ylabel(\"mean audio duration (sec)\")\n",
    "plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "plt.title(\"text length vs median audio duration\")\n",
    "plt.xlabel(\"text length (chars)\")\n",
    "plt.ylabel(\"median audio duration (sec)\")\n",
    "plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "plt.title(\"text length vs STD\")\n",
    "plt.xlabel(\"text length (chars)\")\n",
    "plt.ylabel(\"std of audio duration (sec)\")\n",
    "plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "plt.title(\"text length vs # instances\")\n",
    "plt.xlabel(\"text length (chars)\")\n",
    "plt.ylabel(\"# instances\")\n",
    "plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()));"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "Collapsed": "false"
   },
   "source": [
    "### Check words frequencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# one row per word, column 0 holds its count; most frequent words first\n",
    "w_count_df = pd.DataFrame.from_dict(w_count, orient=\"index\").sort_values(0, ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "w_count_df.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# check a certain word (a Counter returns 0 for a word that never occurs)\n",
    "w_count[\"minute\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# frequency bar plot - it takes time!!\n",
    "w_count_df.plot.bar()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}