File size: 4,507 Bytes
a9482ab
 
b87f08b
7f6563e
cab646b
7f6563e
510f17f
aeceb48
7f6563e
8772ca9
7f6563e
d59ee2f
 
9630f4e
dd29aa4
1a8cc73
e28cac3
a9482ab
a7d0893
cab646b
9630f4e
 
55ef1e7
a9482ab
 
 
 
 
 
 
 
 
 
 
 
 
 
63ced49
 
a9482ab
7663e41
a9482ab
 
132c7ea
2b65d86
d2e0f91
b7d4e28
d2e0f91
 
 
b7d4e28
d2e0f91
dd29aa4
 
 
1a8cc73
 
 
 
 
 
 
 
9c77c78
d9f9ad4
1a8cc73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9de5e1
 
 
1a8cc73
 
 
 
 
 
d2e0f91
91eda71
d9f9ad4
1256bad
 
 
d9f9ad4
 
bfae475
1a8cc73
 
 
 
 
 
 
 
 
 
 
 
 
 
66ca704
1a8cc73
 
 
 
 
 
 
 
a0ece8c
1a8cc73
 
 
8622a01
ad2aa2e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import IPython

import sys
import subprocess
import os

# Runtime dependency bootstrap. Both installs go through the pip module of the
# interpreter running this script; check_call raises if pip exits non-zero.
_PIP_INSTALL = [sys.executable, "-m", "pip", "install"]

# Force a fresh install of the Tortoise fork straight from GitHub.
subprocess.check_call(
    _PIP_INSTALL
    + ["--upgrade", "--force-reinstall", "git+https://github.com/osanseviero/tortoise-tts.git"]
)

# entmax could not be installed at the same time as torch, so it gets its own pip run.
subprocess.check_call(_PIP_INSTALL + ["entmax"])

from tortoise_tts.api import TextToSpeech
from tortoise_tts.utils.audio import load_audio, get_voices
import torch 
import torchaudio
import numpy as np
import gradio as gr

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Ask the CUDA runtime to run kernel launches synchronously, which makes
# device-side errors surface at the call that caused them.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# This will download all the models used by Tortoise from the HF hub.
# Pass the detected device instead of a hard-coded "cuda" so that start-up
# does not require a GPU on CPU-only hosts.
tts = TextToSpeech(device=device)

# Voice names offered in the "Pre-recorded voices" dropdown. Each one is used
# as a key into `voice_paths` by `inference`.
voices = [
  "angie",
  "daniel",
  "deniro",
  "emma",
  "freeman",
  "geralt",
  "halle",
  "jlaw",
  "lj",
  "snakes",
  # NOTE(review): the only capitalized entry -- confirm it matches a key
  # returned by get_voices(), otherwise selecting it raises KeyError.
  "William",
]
# Indexed by voice name in `inference`; each value is iterated as a collection
# of audio paths there.
voice_paths = get_voices()
print(voice_paths)

# Tortoise quality/speed preset handed to tts.tts_with_preset for every request.
preset = "fast"

def inference(text, voice):
    """Synthesize ``text`` using one of the pre-recorded voices.

    Args:
        text: Text to speak; only its first 256 characters are used.
        voice: Key into the module-level ``voice_paths`` mapping.

    Returns:
        The string ``"generated.wav"``; that file is overwritten on each call.
    """
    prompt = text[:256]
    clip_paths = voice_paths[voice]
    print(voice_paths, voice, clip_paths)

    # Load every reference clip of the chosen voice (22050 is passed through to
    # load_audio, presumably as the target sampling rate).
    reference_clips = [load_audio(clip_path, 22050) for clip_path in clip_paths]
    print(prompt, reference_clips, preset)

    speech = tts.tts_with_preset(prompt, reference_clips, preset)
    print("gen")

    # Drop the leading dimension, move to CPU and write the result as a
    # 24 kHz file.
    torchaudio.save('generated.wav', speech.squeeze(0).cpu(), 24000)
    return "generated.wav"

def load_audio_special(sr, data):
    """Convert a raw recording into a mono float tensor clipped to [-1, 1].

    Args:
        sr: Sample rate of ``data``. Accepted so callers can pass the
            ``(sample_rate, samples)`` pair through unchanged, but it is not
            used here: no resampling is performed.
            NOTE(review): the pre-recorded path loads clips with 22050 passed
            to load_audio -- confirm whether recordings need resampling too.
        data: NumPy array of audio samples. Signed-integer input is divided by
            its full-scale value (2**15 for int16, 2**31 for int32, ...);
            floating-point input is taken as already normalized. For 2-D
            input the first channel is kept.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(1, num_samples)``.

    Raises:
        ValueError: If ``data`` is neither signed-integer nor floating-point.
    """
    dtype = data.dtype
    if np.issubdtype(dtype, np.signedinteger):
        # Full-scale magnitude of an n-bit signed integer is 2**(n-1).
        norm_fix = float(2 ** (8 * dtype.itemsize - 1))
    elif np.issubdtype(dtype, np.floating):
        norm_fix = 1.
    else:
        # Previously this fell through and failed later with an unbound
        # `norm_fix`; fail early with a message that names the dtype instead.
        raise ValueError(f"Unsupported audio dtype: {dtype}")
    audio = torch.FloatTensor(data.astype(np.float32)) / norm_fix

    # Remove any channel data: keep only the first channel, whichever axis
    # (the short one, < 5 entries) holds the channels.
    if len(audio.shape) > 1:
        if audio.shape[0] < 5:
            audio = audio[0]
        else:
            assert audio.shape[1] < 5
            audio = audio[:, 0]

    # Check some assumptions about audio range and squawk when they do not hold.
    # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
    # The message no longer references a file path: this function receives
    # in-memory samples, and the old undefined name raised NameError here.
    if torch.any(audio > 2) or not torch.any(audio < 0):
        print(f"Error with recorded audio. Max={audio.max()} min={audio.min()}")
    audio.clip_(-1, 1)
    return audio.unsqueeze(0)
    
def inference_own_voice(text, voice_1, voice_2, voice_3):
    """Synthesize ``text`` using three user recordings as the reference voice.

    Args:
        text: Text to speak; only its first 256 characters are used.
        voice_1: First recording; element 0 and element 1 are forwarded to
            ``load_audio_special`` as ``sr`` and ``data``.
        voice_2: Second recording, same layout as ``voice_1``.
        voice_3: Third recording, same layout as ``voice_1``.

    Returns:
        The string ``"generated.wav"``; that file is overwritten on each call.
    """
    prompt = text[:256]
    print(voice_1)

    # Convert each recording to a tensor, in the order the clips were given.
    recordings = (voice_1, voice_2, voice_3)
    reference_clips = [
        load_audio_special(recording[0], recording[1]) for recording in recordings
    ]
    print(prompt, reference_clips, preset)

    speech = tts.tts_with_preset(prompt, reference_clips, preset)
    print("gen")

    # Drop the leading dimension, move to CPU and write the result as a
    # 24 kHz file.
    torchaudio.save('generated.wav', speech.squeeze(0).cpu(), 24000)
    return "generated.wav"
 
# Default prompt: pre-filled in both text boxes and reused by the first two examples.
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
# [text, voice] rows offered as clickable examples on the pre-recorded-voices tab.
examples = [
    [text, "angie"],
    [text, "emma"],
    ["how are you doing this day", "freeman"]
]

# Top-level container; everything created inside the `with` is part of this app.
block = gr.Blocks(enable_queue=True)
with block:
    gr.Markdown("# TorToiSe")
    gr.Markdown("A multi-voice TTS system trained with an emphasis on quality")
    with gr.Tabs():
        # Tab 1: pick one of the names in `voices` and synthesize via `inference`.
        with gr.TabItem("Pre-recorded voices"):
            iface = gr.Interface(
                inference,
                inputs=[
                    gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
                    gr.inputs.Dropdown(voices),
                ],
                outputs="audio",
                examples=examples,
            )
        # Tab 2: three microphone recordings are passed to `inference_own_voice`
        # as voice_1..voice_3.
        with gr.TabItem("Record your voice (experimental, might not work well)"):
            # NOTE(review): `iface` is rebound here and is not read again in the
            # visible code; the name only serves to build the Interface.
            iface = gr.Interface(
              inference_own_voice,
              inputs=[
                  gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
                  gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 1)", type="numpy"),
                  gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 2)", type="numpy"),
                  gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 3)", type="numpy"),
              ],
              outputs="audio",
            )

    # NOTE(review): this text says "ultra fast" while the module-level `preset`
    # is "fast" -- confirm which one is intended.
    gr.Markdown("This demo shows the ultra fast option in the TorToiSe system. For more info check the <a href='https://github.com/neonbjb/tortoise-tts' target='_blank'>Repository</a>.",)

    # Start the app with Gradio's debug flag on. Note that launch() is invoked
    # while still inside the `with block:` context.
    block.launch(debug=True)