{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "tortoise-tts.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "Welcome to Tortoise! 🐒🐒🐒🐒\n",
        "\n",
        "Before you begin, I **strongly** recommend you turn on a GPU runtime.\n",
        "\n",
        "There's a reason this is called \"Tortoise\" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU."
      ],
      "metadata": {
        "id": "_pIZ3ZXNp7cf"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "JrK20I32grP6",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "44f55dca-5d0a-405e-a4cc-54bc8e16b780"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'tortoise-tts'...\n",
            "remote: Enumerating objects: 736, done.\u001b[K\n",
            "remote: Counting objects: 100% (23/23), done.\u001b[K\n",
            "remote: Compressing objects: 100% (15/15), done.\u001b[K\n",
            "remote: Total 736 (delta 10), reused 20 (delta 8), pack-reused 713\u001b[K\n",
            "Receiving objects: 100% (736/736), 348.62 MiB | 24.08 MiB/s, done.\n",
            "Resolving deltas: 100% (161/161), done.\n",
            "/content/tortoise-tts\n",
            "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from -r requirements.txt (line 1)) (1.10.0+cu111)\n",
            "Requirement already satisfied: torchaudio in /usr/local/lib/python3.7/dist-packages (from -r requirements.txt (line 2)) (0.10.0+cu111)\n",
            "Collecting rotary_embedding_torch\n",
            " Downloading rotary_embedding_torch-0.1.5-py3-none-any.whl (4.1 kB)\n",
            "Collecting transformers\n",
            " Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)\n",
            "\u001b[K |████████████████████████████████| 4.0 MB 5.3 MB/s \n",
            "\u001b[?25hCollecting tokenizers\n",
            " Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n",
            "\u001b[K |████████████████████████████████| 6.6 MB 31.3 MB/s \n",
            "\u001b[?25hRequirement already satisfied: inflect in /usr/local/lib/python3.7/dist-packages (from -r requirements.txt (line 6)) (2.1.0)\n",
            "Collecting progressbar\n",
            " Downloading progressbar-2.5.tar.gz (10 kB)\n",
            "Collecting einops\n",
            " Downloading einops-0.4.1-py3-none-any.whl (28 kB)\n",
            "Collecting unidecode\n",
            " Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)\n",
            "\u001b[K |████████████████████████████████| 235 kB 44.3 MB/s \n",
            "\u001b[?25hCollecting entmax\n",
            " Downloading entmax-1.0.tar.gz (7.2 kB)\n",
            "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->-r requirements.txt (line 1)) (4.1.1)\n",
            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (4.64.0)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (21.3)\n",
            "Collecting sacremoses\n",
            " Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)\n",
            "\u001b[K |████████████████████████████████| 895 kB 36.6 MB/s \n",
            "\u001b[?25hCollecting huggingface-hub<1.0,>=0.1.0\n",
            " Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)\n",
            "\u001b[K |████████████████████████████████| 77 kB 6.3 MB/s \n",
            "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (3.6.0)\n",
            "Collecting pyyaml>=5.1\n",
            " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
            "\u001b[K |████████████████████████████████| 596 kB 38.9 MB/s \n",
            "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (1.21.6)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (2.23.0)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (2019.12.20)\n",
            "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (4.11.3)\n",
            "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers->-r requirements.txt (line 4)) (3.0.8)\n",
            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers->-r requirements.txt (line 4)) (3.8.0)\n",
            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->-r requirements.txt (line 4)) (1.24.3)\n",
            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->-r requirements.txt (line 4)) (3.0.4)\n",
            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->-r requirements.txt (line 4)) (2.10)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->-r requirements.txt (line 4)) (2021.10.8)\n",
            "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->-r requirements.txt (line 4)) (1.15.0)\n",
            "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->-r requirements.txt (line 4)) (1.1.0)\n",
            "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->-r requirements.txt (line 4)) (7.1.2)\n",
            "Building wheels for collected packages: progressbar, entmax\n",
            " Building wheel for progressbar (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            " Created wheel for progressbar: filename=progressbar-2.5-py3-none-any.whl size=12082 sha256=bb7d90605d0bf4d89aedc46bd8ed39538f55e00ee70fa382c1af81f142f08fa8\n",
            " Stored in directory: /root/.cache/pip/wheels/f0/fd/1f/3e35ed57e94cd8ced38dd46771f1f0f94f65fec548659ed855\n",
            " Building wheel for entmax (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            " Created wheel for entmax: filename=entmax-1.0-py3-none-any.whl size=11015 sha256=5e2cf723e790ec941984d2030eb3231e1ae3ce75231709391a13edcd2bfb4770\n",
            " Stored in directory: /root/.cache/pip/wheels/f7/e8/0d/acc29c2f66e69a1f42483347fa8545c293dec12325ee161716\n",
            "Successfully built progressbar entmax\n",
            "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, einops, unidecode, transformers, rotary-embedding-torch, progressbar, entmax\n",
            " Attempting uninstall: pyyaml\n",
            " Found existing installation: PyYAML 3.13\n",
            " Uninstalling PyYAML-3.13:\n",
            " Successfully uninstalled PyYAML-3.13\n",
            "Successfully installed einops-0.4.1 entmax-1.0 huggingface-hub-0.5.1 progressbar-2.5 pyyaml-6.0 rotary-embedding-torch-0.1.5 sacremoses-0.0.49 tokenizers-0.12.1 transformers-4.18.0 unidecode-1.3.4\n"
          ]
        }
      ],
      "source": [
        "# Clone the repo and %cd into it: the later cells import `api` / `utils.audio`\n",
        "# and list `voices` relative to this directory. Then install its requirements.\n",
        "!git clone https://github.com/neonbjb/tortoise-tts.git\n",
        "%cd tortoise-tts\n",
        "!pip install -r requirements.txt"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Imports used through the rest of the notebook.\n",
        "import torch\n",
        "import torchaudio\n",
        "import torch.nn as nn\n",
        "import torch.nn.functional as F\n",
        "\n",
        "from api import TextToSpeech\n",
        "from utils.audio import load_audio, get_voices\n",
        "\n",
        "# This will download all the models used by Tortoise from the HF hub.\n",
        "tts = TextToSpeech()"
      ],
      "metadata": {
        "id": "Gen09NM4hONQ",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "35c1fb4b-5998-4e75-9ec9-29521b301db6"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Downloading autoregressive.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/autoregressive.pth...\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Done.\n",
            "Downloading clvp.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/clvp.pth...\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Done.\n",
            "Downloading cvvp.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/cvvp.pth...\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Done.\n",
            "Downloading diffusion_decoder.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/diffusion_decoder.pth...\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Done.\n",
            "Downloading vocoder.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/vocoder.pth...\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Done.\n",
            "Removing weight norm...\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# List all the voices available. These are just some random clips I've gathered\n",
        "# from the internet as well as a few voices from the training dataset.\n",
        "# Feel free to add your own clips to the voices/ folder.\n",
        "%ls voices"
      ],
      "metadata": {
        "id": "SSleVnRAiEE2",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "e1eb09e2-1b68-4f81-b679-edb97538da39"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[0m\u001b[01;34mangelina_jolie\u001b[0m/ \u001b[01;34mhalle_barry\u001b[0m/ \u001b[01;34mlj\u001b[0m/ \u001b[01;34msamuel_jackson\u001b[0m/\n",
            "\u001b[01;34matkins\u001b[0m/ \u001b[01;34mharris\u001b[0m/ \u001b[01;34mmol\u001b[0m/ \u001b[01;34msigourney_weaver\u001b[0m/\n",
            "\u001b[01;34mcarlin\u001b[0m/ \u001b[01;34mhenry_cavill\u001b[0m/ \u001b[01;34mmorgan_freeman\u001b[0m/ \u001b[01;34mtom_hanks\u001b[0m/\n",
            "\u001b[01;34mdaniel_craig\u001b[0m/ \u001b[01;34mjennifer_lawrence\u001b[0m/ \u001b[01;34mmyself\u001b[0m/ \u001b[01;34mwilliam_shatner\u001b[0m/\n",
            "\u001b[01;34mdotrice\u001b[0m/ \u001b[01;34mjohn_krasinski\u001b[0m/ \u001b[01;34motto\u001b[0m/\n",
            "\u001b[01;34memma_stone\u001b[0m/ \u001b[01;34mkennard\u001b[0m/ \u001b[01;34mpatrick_stewart\u001b[0m/\n",
            "\u001b[01;34mgrace\u001b[0m/ \u001b[01;34mlescault\u001b[0m/ \u001b[01;34mrobert_deniro\u001b[0m/\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# This is the text that will be spoken.\n",
        "text = \"Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?\"\n",
        "\n",
        "# Here's something for the poetically inclined.. (set text=)\n",
        "# NOTE: the triple-quoted block below is a bare string expression - it is evaluated\n",
        "# and discarded. Assign it to `text` if you want it spoken instead.\n",
        "\"\"\"\n",
        "Then took the other, as just as fair,\n",
        "And having perhaps the better claim,\n",
        "Because it was grassy and wanted wear;\n",
        "Though as for that the passing there\n",
        "Had worn them really about the same,\"\"\"\n",
        "\n",
        "# Pick one of the voices from above\n",
        "voice = 'dotrice'\n",
        "# Pick a \"preset mode\" to determine quality. Options: {\"ultra_fast\", \"fast\" (default), \"standard\", \"high_quality\"}. See docs in api.py\n",
        "preset = \"fast\""
      ],
      "metadata": {
        "id": "bt_aoxONjfL2"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Fetch the voice references and forward execute!\n",
        "voices = get_voices()\n",
        "# `voices` is indexed by voice name; the entry is iterated below as a list of clip paths.\n",
        "cond_paths = voices[voice]\n",
        "conds = []\n",
        "for cond_path in cond_paths:\n",
        "    # 22050 is load_audio's second argument - presumably the sample rate to load at; confirm in utils/audio.py.\n",
        "    c = load_audio(cond_path, 22050)\n",
        "    conds.append(c)\n",
        "\n",
        "gen = tts.tts_with_preset(text, conds, preset)\n",
        "# torchaudio.save takes the sample rate as its third argument, so the WAV is written as 24 kHz audio.\n",
        "torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)"
      ],
      "metadata": {
        "id": "KEXOKjIvn6NW",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "7977bfd7-9fbc-41f7-d3ac-25fd4e350049"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 6/6 [01:18<00:00, 13.11s/it]\n",
            "/usr/local/lib/python3.7/dist-packages/torch/utils/checkpoint.py:25: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
            " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n",
            "/content/tortoise-tts/models/autoregressive.py:359: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
            " mel_lengths = wav_lengths // self.mel_length_compression\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Performing vocoding..\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 32/32 [00:16<00:00, 1.94it/s]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# You can add as many conditioning voices as you want together. Combining\n",
        "# clips from multiple voices takes the mean of the latent space for all\n",
        "# voices. This creates a novel voice that is a combination of the two inputs.\n",
        "#\n",
        "# Let's see what it would sound like if Picard and Kirk had a kid with a penchant for philosophy:\n",
        "# (`voices` and `preset` are reused from the cells above, so run those first.)\n",
        "conds = []\n",
        "for v in ['patrick_stewart', 'william_shatner']:\n",
        "    cond_paths = voices[v]\n",
        "    for cond_path in cond_paths:\n",
        "        c = load_audio(cond_path, 22050)\n",
        "        conds.append(c)\n",
        "\n",
        "gen = tts.tts_with_preset(\"They used to say that if man was meant to fly, he’d have wings. But he did fly. He discovered he had to.\", conds, preset)\n",
        "torchaudio.save('captain_kirkard.wav', gen.squeeze(0).cpu(), 24000)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "fYTk8KUezUr5",
        "outputId": "8a07f251-c90f-4e6a-c204-132b737dfff8"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 6/6 [01:45<00:00, 17.62s/it]\n",
            "/usr/local/lib/python3.7/dist-packages/torch/utils/checkpoint.py:25: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
            " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n",
            "/content/tortoise-tts/models/autoregressive.py:359: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
            " mel_lengths = wav_lengths // self.mel_length_compression\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Performing vocoding..\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 32/32 [00:16<00:00, 2.00it/s]\n"
          ]
        }
      ]
    }
  ]
}