{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "tortoise-tts.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "source": [ "Welcome to Tortoise! 🐒🐒🐒🐒\n", "\n", "Before you begin, I **strongly** recommend you turn on a GPU runtime.\n", "\n", "There's a reason this is called \"Tortoise\" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU." ], "metadata": { "id": "_pIZ3ZXNp7cf" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JrK20I32grP6" }, "outputs": [], "source": [ "!git clone https://github.com/neonbjb/tortoise-tts.git\n", "%cd tortoise-tts\n", "!pip install -r requirements.txt" ] }, { "cell_type": "code", "source": [ "# Imports used through the rest of the notebook.\n", "import torch\n", "import torchaudio\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from api import TextToSpeech\n", "from utils.audio import load_audio, get_voices\n", "\n", "# This will download all the models used by Tortoise from the HF hub.\n", "tts = TextToSpeech()" ], "metadata": { "id": "Gen09NM4hONQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# List all the voices available. These are just some random clips I've gathered\n", "# from the internet as well as a few voices from the training dataset.\n", "# Feel free to add your own clips to the voices/ folder.\n", "%ls voices" ], "metadata": { "id": "SSleVnRAiEE2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# This is the text that will be spoken.\n", "text = \"Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?\"\n", "\n", "# Here's something for the poetically inclined.. (set text=)\n", "\"\"\"\n", "Then took the other, as just as fair,\n", "And having perhaps the better claim,\n", "Because it was grassy and wanted wear;\n", "Though as for that the passing there\n", "Had worn them really about the same,\"\"\"\n", "\n", "# Pick one of the voices from above\n", "voice = 'dotrice'\n", "# Pick a \"preset mode\" to determine quality. Options: {\"ultra_fast\", \"fast\" (default), \"standard\", \"high_quality\"}. See docs in api.py\n", "preset = \"fast\"" ], "metadata": { "id": "bt_aoxONjfL2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Fetch the voice references and forward execute!\n", "voices = get_voices()\n", "cond_paths = voices[voice]\n", "conds = []\n", "for cond_path in cond_paths:\n", " c = load_audio(cond_path, 22050)\n", " conds.append(c)\n", "\n", "gen = tts.tts_with_preset(text, conds, preset)\n", "torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)" ], "metadata": { "id": "KEXOKjIvn6NW" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# You can add as many conditioning voices as you want together. Combining\n", "# clips from multiple voices takes the mean of the latent space for all\n", "# voices. 
{ "cell_type": "code", "source": [ "# You can add as many conditioning voices as you want together. Combining\n", "# clips from multiple voices takes the mean of the latent space for all\n", "# voices. This creates a novel voice that is a combination of the two inputs.\n", "#\n", "# Let's see what it would sound like if Picard and Kirk had a kid with a penchant for philosophy:\n", "conds = []\n", "for v in ['pat', 'william']:\n", "    cond_paths = voices[v]\n", "    for cond_path in cond_paths:\n", "        c = load_audio(cond_path, 22050)\n", "        conds.append(c)\n", "\n", "gen = tts.tts_with_preset(\"They used to say that if man was meant to fly, he’d have wings. But he did fly. He discovered he had to.\", conds, preset)\n", "torchaudio.save('captain_kirkard.wav', gen.squeeze(0).cpu(), 24000)" ], "metadata": { "id": "fYTk8KUezUr5" }, "execution_count": null, "outputs": [] } ] }