"""Gradio demo that synthesizes speech with Tortoise TTS.

At import time the script clones the tortoise-tts repository (if it is not
already present), puts it on ``sys.path``, instantiates the TTS model and
launches a Gradio interface with a text box and a voice dropdown.
"""
import os
import subprocess
import sys

import gradio as gr
import IPython
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

_TORTOISE_REPO = "https://github.com/neonbjb/tortoise-tts.git"
_TORTOISE_DIR = "tortoise-tts"

# Clone only when the checkout is missing: re-running `git clone` into an
# existing directory fails, and the imports below cannot work without it.
if not os.path.isdir(_TORTOISE_DIR):
    subprocess.run(["git", "clone", _TORTOISE_REPO, _TORTOISE_DIR], check=True)
sys.path.append(_TORTOISE_DIR)

# These modules live inside the cloned repository, so they can only be
# imported after the sys.path entry above has been added.
from api import TextToSpeech  # noqa: E402
from utils.audio import load_audio, get_voices  # noqa: E402

# This will download all the models used by Tortoise from the HF hub.
tts = TextToSpeech()

# `inference` looks a voice up in this object and loads every entry it finds
# as a conditioning clip.
# NOTE(review): assumes get_voices() returns a mapping of voice name -> list
# of audio file paths -- confirm against utils.audio.
voices = get_voices()
voice_names = sorted(voices)

preset = "fastest"


def inference(text: str, voice: str):
    """Generate speech for ``text`` conditioned on the clips of ``voice``.

    Args:
        text: The sentence to synthesize.
        voice: Key into the module-level ``voices`` mapping.

    Returns:
        Whatever ``tts.tts_with_preset`` returns for the given text,
        conditioning clips and the module-level ``preset``.

    Raises:
        KeyError: If ``voice`` is not a key of ``voices``.
    """
    conds = [load_audio(cond_path, 22050) for cond_path in voices[voice]]
    # NOTE(review): the Gradio "audio" output normally expects a file path or
    # a (sample_rate, numpy_array) pair; the return value of tts_with_preset
    # may need converting (e.g. via torchaudio.save) -- confirm the tensor
    # layout and sample rate before changing this.
    return tts.tts_with_preset(text, conds, preset)


text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"

iface = gr.Interface(
    inference,
    inputs=[
        gr.inputs.Textbox(type="text", default=text, label="Text"),
        # Pass the selected voice *name* to `inference`; an integer index
        # cannot be used to look up an entry in `voices`.
        gr.inputs.Dropdown(voice_names, type="value"),
    ],
    outputs="audio",
)
iface.launch()