import os
import sys

import gradio as gr

# Fetch and install Tortoise TTS at startup so the package is importable below.
os.system("git clone https://github.com/neonbjb/tortoise-tts.git")
sys.path.append("./tortoise-tts/")
os.system("pip install -r ./tortoise-tts/requirements.txt")
os.system("python ./tortoise-tts/setup.py install")

import torch
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

# Load the Tortoise models once at startup, outside the request handler.
tts = TextToSpeech()


def main(text, voice, preset):
    # Load the reference clips (and any cached conditioning latents) for the chosen voice.
    voice_samples, conditioning_latents = load_voice(voice)
    gen = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset=preset,
    )
    # Tortoise outputs 24 kHz audio; write it to disk and return the path for gr.Audio.
    torchaudio.save("generated.wav", gen.squeeze(0).cpu(), 24000)
    return "generated.wav"


voices = [
    "mol", "tom", "applejack", "daniel", "myself", "weaver", "train_empire",
    "train_dotrice", "rainbow", "pat", "geralt", "halle", "train_kennard",
    "jlaw", "train_grace", "angie", "william", "tim_reynolds", "train_atkins",
    "train_dreams", "train_mouse", "freeman", "deniro", "lj", "train_lescault",
    "emma", "pat2", "snakes", "train_daws",
]
presets = ["ultra_fast", "fast", "standard", "high_quality"]

gr.Interface(
    main,
    [
        gr.Textbox(label="Text", placeholder="Text-to-speak goes here..."),
        gr.Dropdown(voices, value="deniro", label="Voice"),
        gr.Dropdown(presets, value="ultra_fast", label="Preset"),
    ],
    gr.Audio(),
    description="TorToiSe - a multi-voice TTS system by jbetker | src\nNote: inference is very slow on CPU; for quicker inference times use the Colab notebook linked in the source repository.",
    enable_queue=True,
).launch()