{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This is a notebook used to generate the speaker embeddings with the CorentinJ GE2E model trained with Angular Prototypical loss for multi-speaker training.\n", "\n", "Before running this script please DON'T FORGET:\n", "- to set the right paths in the cell below.\n", "\n", "Repositories:\n", "- TTS: https://github.com/mozilla/TTS\n", "- CorentinJ GE2E: https://github.com/Edresson/GE2E-Speaker-Encoder" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [], "source": [ "import os\n", "import importlib\n", "import random\n", "import librosa\n", "import torch\n", "\n", "import numpy as np\n", "from TTS.utils.io import load_config\n", "from tqdm import tqdm\n", "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", "\n", "# you may need to change this depending on your system\n", "os.environ['CUDA_VISIBLE_DEVICES']='0'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cloning into 'Real-Time-Voice-Cloning'...\n", "remote: Enumerating objects: 5, done.\u001b[K\n", "remote: Counting objects: 100% (5/5), done.\u001b[K\n", "remote: Compressing objects: 100% (5/5), done.\u001b[K\n", "remote: Total 2508 (delta 0), reused 3 (delta 0), pack-reused 2503\u001b[K\n", "Receiving objects: 100% (2508/2508), 360.78 MiB | 17.84 MiB/s, done.\n", "Resolving deltas: 100% (1387/1387), done.\n", "Checking connectivity... 
done.\n" ] } ], "source": [ "# Clone encoder \n", "!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n", "os.chdir('Real-Time-Voice-Cloning/')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "#Install voxceleb_trainer Requirements\n", "!python -m pip install umap-learn visdom webrtcvad librosa>=0.5.1 matplotlib>=2.0.2 numpy>=1.14.0 scipy>=1.0.0 tqdm sounddevice Unidecode inflect multiprocess numba" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2020-08-05 06:51:05-- https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n", "Resolving github.com (github.com)... 18.231.5.6\n", "Connecting to github.com (github.com)|18.231.5.6|:443... connected.\n", "HTTP request sent, awaiting response... 301 Moved Permanently\n", "Location: https://github.com/Edresson/GE2E-Speaker-Encoder/releases/download/checkpoints/pretrained.zip [following]\n", "--2020-08-05 06:51:05-- https://github.com/Edresson/GE2E-Speaker-Encoder/releases/download/checkpoints/pretrained.zip\n", "Reusing existing connection to github.com:443.\n", "HTTP request sent, awaiting response... 
302 Found\n", "Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/263893598/f7f31d80-96df-11ea-8345-261fc35f9849?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200805%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200805T101614Z&X-Amz-Expires=300&X-Amz-Signature=df7724c28668ebd5dfbcc6a9b51f6afb78193c30119f3a1c3eef678188aabd1e&X-Amz-SignedHeaders=host&actor_id=0&repo_id=263893598&response-content-disposition=attachment%3B%20filename%3Dpretrained.zip&response-content-type=application%2Foctet-stream [following]\n", "--2020-08-05 06:51:05-- https://github-production-release-asset-2e65be.s3.amazonaws.com/263893598/f7f31d80-96df-11ea-8345-261fc35f9849?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200805%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200805T101614Z&X-Amz-Expires=300&X-Amz-Signature=df7724c28668ebd5dfbcc6a9b51f6afb78193c30119f3a1c3eef678188aabd1e&X-Amz-SignedHeaders=host&actor_id=0&repo_id=263893598&response-content-disposition=attachment%3B%20filename%3Dpretrained.zip&response-content-type=application%2Foctet-stream\n", "Resolving github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)... 52.216.18.24\n", "Connecting to github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)|52.216.18.24|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 383640573 (366M) [application/octet-stream]\n", "Saving to: ‘pretrained.zip’\n", "\n", "pretrained.zip 100%[===================>] 365,87M 6,62MB/s in 56s \n", "\n", "2020-08-05 06:52:03 (6,48 MB/s) - ‘pretrained.zip’ saved [383640573/383640573]\n", "\n", "Archive: pretrained.zip\n", " creating: encoder/saved_models/\n", " inflating: encoder/saved_models/pretrained.pt \n", " creating: synthesizer/saved_models/\n", " creating: synthesizer/saved_models/logs-pretrained/\n", " creating: synthesizer/saved_models/logs-pretrained/taco_pretrained/\n", " extracting: synthesizer/saved_models/logs-pretrained/taco_pretrained/checkpoint \n", " inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.data-00000-of-00001 \n", " inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.index \n", " inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.meta \n", " creating: vocoder/saved_models/\n", " creating: vocoder/saved_models/pretrained/\n", " inflating: vocoder/saved_models/pretrained/pretrained.pt \n" ] } ], "source": [ "#Download encoder Checkpoint\n", "!wget https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n", "!unzip pretrained.zip" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from encoder import inference as encoder\n", "from encoder.params_model import model_embedding_size as speaker_embedding_size\n", "from pathlib import Path" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Preparing the encoder, the synthesizer and the vocoder...\n", "Loaded encoder \"pretrained.pt\" trained to step 1564501\n", "Testing your configuration with small inputs.\n", "\tTesting the encoder...\n", "(256,)\n" ] } ], "source": [ "print(\"Preparing the encoder, the synthesizer 
and the vocoder...\")\n", "encoder.load_model(Path('encoder/saved_models/pretrained.pt'))\n", "print(\"Testing your configuration with small inputs.\")\n", "# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's\n", "# sampling rate, which may differ.\n", "# If you're unfamiliar with digital audio, know that it is encoded as an array of floats \n", "# (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.\n", "# The sampling rate is the number of values (samples) recorded per second, it is set to\n", "# 16000 for the encoder. Creating an array of length will always correspond \n", "# to an audio of 1 second.\n", "print(\"\\tTesting the encoder...\")\n", "\n", "wav = np.zeros(encoder.sampling_rate) \n", "embed = encoder.embed_utterance(wav)\n", "print(embed.shape)\n", "\n", "# Embeddings are L2-normalized (this isn't important here, but if you want to make your own \n", "# embeddings it will be).\n", "#embed /= np.linalg.norm(embed) # for random embedding\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "SAVE_PATH = '../'" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Set constants\n", "DATASETS_NAME = ['vctk'] # list the datasets\n", "DATASETS_PATH = ['../../../../../datasets/VCTK-Corpus-removed-silence/']\n", "DATASETS_METAFILE = ['']\n", "USE_CUDA = True" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", " 0%| | 0/44063 [00:00