import os
import sys
import tempfile
from typing import List, Tuple

import gradio as gr
import numpy as np

# Clone TorToiSe and install its dependencies at startup, then make the
# package importable from the checkout.
os.system("git clone https://github.com/neonbjb/tortoise-tts.git")
sys.path.append("tortoise-tts")
os.system("pip install -r tortoise-tts/requirements.txt")

import torch
import torchaudio

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice

# Build the TTS pipeline once at startup; the pretrained model weights are
# downloaded on first run.
tts = TextToSpeech()
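
# Voices bundled with the tortoise-tts repo; "random" skips voice cloning and
# lets the model sample a voice instead.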
VOICES = [
    "random", "train_atkins", "train_daws", "train_dotrice", "train_dreams",
    "train_empire", "train_grace", "train_kennard", "train_lescault",
    "train_mouse", "angie", "applejack", "daniel", "deniro", "emma", "freeman",
    "geralt", "halle", "jlaw", "lj", "mol", "myself", "pat", "pat2", "rainbow",
    "snakes", "tim_reynolds", "tom", "weaver", "william",
]
DEFAULT_VOICE = "random"
PRESETS = ["ultra_fast", "fast", "standard", "high_quality"]
DEFAULT_PRESET = "fast"
DEFAULT_TEXT = "Hello, world!"

README = """# TorToiSe

Tortoise is a text-to-speech model developed by James Betker. It is capable of zero-shot voice cloning from a small set of voice samples. GitHub repo: [neonbjb/tortoise-tts](https://github.com/neonbjb/tortoise-tts).

## Usage

1. Select a model preset and type the text to speak.
2. Load a voice - either by choosing a preset, uploading audio files, or recording via microphone. Select the option to split audio into chunks if the clips are much longer than 10 seconds each. Follow the guidelines in the [voice customization guide](https://github.com/neonbjb/tortoise-tts#voice-customization-guide).
3. Click **Generate**, and wait - it's called *tortoise* for a reason!
"""

# TorToiSe conditions on 22.05 kHz reference audio and generates speech at 24 kHz.
TORTOISE_SR_IN = 22050
TORTOISE_SR_OUT = 24000

def chunk_audio(t: torch.Tensor, sample_rate: int, chunk_duration_sec: int) -> List[torch.Tensor]:
    """Split a (channels, samples) tensor into consecutive chunks of at most
    chunk_duration_sec seconds each."""
    chunk_len = sample_rate * chunk_duration_sec
    duration = t.shape[1] / sample_rate
    num_chunks = 1 + int(duration / chunk_duration_sec)
    chunks = [t[:, chunk_len * i : chunk_len * (i + 1)] for i in range(num_chunks)]
    # The final slice is empty when the duration is an exact multiple of the
    # chunk length; drop it.
    return [chunk for chunk in chunks if chunk.shape[1] > 0]
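
# For example, a 25-second clip at 22,050 Hz splits into chunks of 10 s, 10 s,
# and 5 s:
#   chunk_audio(torch.zeros(1, 22050 * 25), 22050, 10)  # -> 3 tensors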

def tts_main(voice_samples: List[torch.Tensor], text: str, model_preset: str) -> str:
    # Generate speech conditioned on the reference clips. With the default of
    # one candidate, the result is a (1, 1, n_samples) tensor; squeeze(0)
    # yields the (channels, samples) layout that torchaudio.save expects.
    gen = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=None,
        preset=model_preset,
    )
    torchaudio.save("generated.wav", gen.squeeze(0).cpu(), TORTOISE_SR_OUT)
    # Gradio's Audio output component accepts a filepath.
    return "generated.wav"

def tts_from_preset(voice: str, text, model_preset):
    # load_voice returns (voice_samples, conditioning_latents) for the voices
    # bundled with the repo; only the raw samples are used here.
    voice_samples, _ = load_voice(voice)
    return tts_main(voice_samples, text, model_preset)

def tts_from_files(files: List[tempfile._TemporaryFileWrapper], do_chunk, text, model_preset):
    # Gradio delivers uploads as temp-file wrappers; load each clip at the
    # conditioning sample rate.
    voice_samples = [load_audio(f.name, TORTOISE_SR_IN) for f in files]
    if do_chunk:
        voice_samples = [chunk for t in voice_samples for chunk in chunk_audio(t, TORTOISE_SR_IN, 10)]
    return tts_main(voice_samples, text, model_preset)

def tts_from_recording(recording: Tuple[int, np.ndarray], do_chunk, text, model_preset):
    sample_rate, audio = recording

    # The microphone component returns integer PCM; rescale to floats in
    # [-1, 1] before handing the audio to TorToiSe.
    norm_fix = 1
    if audio.dtype == np.int32:
        norm_fix = 2**31
    elif audio.dtype == np.int16:
        norm_fix = 2**15
    audio = torch.FloatTensor(audio.T) / norm_fix

    if len(audio.shape) > 1:
        # Downmix multichannel recordings to mono, keeping the channel axis.
        audio = torch.mean(audio, dim=0, keepdim=True)
    else:
        # Mono recordings arrive 1-D; add the channel axis chunk_audio expects.
        audio = audio.unsqueeze(0)
    audio = torchaudio.transforms.Resample(sample_rate, TORTOISE_SR_IN)(audio)

    if do_chunk:
        voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
    else:
        voice_samples = [audio]
    return tts_main(voice_samples, text, model_preset)

def tts_from_url(audio_url, start_time, end_time, do_chunk, text, model_preset):
    # Grab the audio track with yt-dlp, then keep only the selected window.
    # gr.Number inputs arrive as floats, so cast the sample offsets to int.
    os.system(f"yt-dlp -x --audio-format mp3 --force-overwrites {audio_url} -o audio.mp3")
    audio = load_audio("audio.mp3", TORTOISE_SR_IN)
    audio = audio[:, int(start_time * TORTOISE_SR_IN) : int(end_time * TORTOISE_SR_IN)]
    if do_chunk:
        voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
    else:
        voice_samples = [audio]
    return tts_main(voice_samples, text, model_preset)
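
# Security note: tts_from_url interpolates the raw URL into a shell command;
# a hardened deployment would invoke yt-dlp via subprocess.run with an
# argument list rather than os.system.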

with gr.Blocks() as demo:
    gr.Markdown(README)

    preset = gr.Dropdown(PRESETS, label="Model preset", value=DEFAULT_PRESET)
    text = gr.Textbox(label="Text to speak", value=DEFAULT_TEXT)
    do_chunk_label = "Split audio into chunks? (for audio much longer than 10 seconds.)"
    do_chunk_default = True

    with gr.Tab("Choose preset voice"):
        inp1 = gr.Dropdown(VOICES, value=DEFAULT_VOICE, label="Preset voice")
        btn1 = gr.Button("Generate")

    with gr.Tab("Upload audio"):
        inp2 = gr.File(file_count="multiple")
        do_chunk2 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
        btn2 = gr.Button("Generate")

    with gr.Tab("Record audio"):
        inp3 = gr.Audio(source="microphone")
        do_chunk3 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
        btn3 = gr.Button("Generate")

    # Tab for the URL-based flow; it wires up tts_from_url, which is
    # otherwise unreachable from the UI.
    with gr.Tab("Audio from URL"):
        inp4 = gr.Textbox(label="URL to audio (anything yt-dlp supports)")
        start_time = gr.Number(label="Start time (seconds)", value=0)
        end_time = gr.Number(label="End time (seconds)", value=10)
        do_chunk4 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
        btn4 = gr.Button("Generate")

    audio_out = gr.Audio()

    btn1.click(
        tts_from_preset,
        [inp1, text, preset],
        [audio_out],
    )
    btn2.click(
        tts_from_files,
        [inp2, do_chunk2, text, preset],
        [audio_out],
    )
    btn3.click(
        tts_from_recording,
        [inp3, do_chunk3, text, preset],
        [audio_out],
    )
    btn4.click(
        tts_from_url,
        [inp4, start_time, end_time, do_chunk4, text, preset],
        [audio_out],
    )

demo.launch(share=True)