update requirements and some docs
- api.py +13 -25
- read.py +3 -1
- requirements.txt +2 -1
api.py
CHANGED
@@ -21,7 +21,12 @@ from utils.tokenizer import VoiceBpeTokenizer, lev_distance
 
 
 pbar = None
+
+
 def download_models():
+    """
+    Call to download all the models that Tortoise uses.
+    """
     MODELS = {
         'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/autoregressive.pth',
         'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/clvp.pth',
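For illustration only (not part of the diff): the body of download_models() is elided by this hunk. A minimal sketch of how the MODELS mapping above could be fetched, assuming the requests package is available (it is not listed in requirements.txt here); the repo's real implementation likely differs and wires in the progressbar package, given the pbar = None above.

import os
import requests  # assumption: requests is installed

def fetch_model(name, url, out_dir='.models'):
    # Download one checkpoint into out_dir, skipping files that already exist.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, name)
    if not os.path.exists(out_path):
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(out_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
    return out_path
    # e.g.: for name, url in MODELS.items(): fetch_model(name, url)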
@@ -51,6 +56,9 @@ def download_models():
 
 
 def pad_or_truncate(t, length):
+    """
+    Utility function for forcing <t> to have the specified sequence length, whether by clipping it or padding it with 0s.
+    """
     if t.shape[-1] == length:
         return t
     elif t.shape[-1] < length:
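The remaining branches of pad_or_truncate fall outside this hunk. As an illustration (not necessarily the exact implementation), a sketch of the behavior the new docstring describes: zero-pad short inputs and slice long ones along the last dimension.

import torch
import torch.nn.functional as F

def pad_or_truncate_sketch(t, length):
    # Force the last dimension of t to be exactly `length`.
    if t.shape[-1] == length:
        return t
    elif t.shape[-1] < length:
        return F.pad(t, (0, length - t.shape[-1]))  # right-pad with zeros
    else:
        return t[..., :length]  # clip the extra samples

x = torch.randn(1, 80, 120)
assert pad_or_truncate_sketch(x, 200).shape[-1] == 200
assert pad_or_truncate_sketch(x, 100).shape[-1] == 100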
@@ -68,7 +76,10 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
                            conditioning_free=cond_free, conditioning_free_k=cond_free_k)
 
 
-def load_conditioning(clip, cond_length=132300):
+def format_conditioning(clip, cond_length=132300):
+    """
+    Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
+    """
     gap = clip.shape[-1] - cond_length
     if gap < 0:
         clip = F.pad(clip, pad=(0, abs(gap)))
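A usage sketch for the renamed helper (illustrative, not part of the diff): the path and sampling rate are hypothetical, and load_audio from utils.audio is assumed to return a 1xN float waveform tensor; note that format_conditioning returns its MEL on the GPU, per the .cuda() call later in the function.

from utils.audio import load_audio
from api import format_conditioning

clip = load_audio('voices/myvoice/1.wav', 22050)  # hypothetical reference clip of the target speaker
cond_mel = format_conditioning(clip)              # MEL spectrogram, padded/clipped to cond_length, on the GPU
print(cond_mel.shape)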
@@ -79,29 +90,6 @@ def load_conditioning(clip, cond_length=132300):
     return mel_clip.unsqueeze(0).cuda()
 
 
-def clip_guided_generation(autoregressive_model, clip_model, conditioning_input, text_input, num_batches, stop_mel_token,
-                           tokens_per_clip_inference=10, clip_results_to_reduce_to=8, **generation_kwargs):
-    """
-    Uses a CLVP model trained to associate full text with **partial** audio clips to pick the best generation candidates
-    every few iterations. The top results are then propagated forward through the generation process. Rinse and repeat.
-    This is a hybrid between beam search and sampling.
-    """
-    token_goal = tokens_per_clip_inference
-    finished = False
-    while not finished and token_goal < autoregressive_model.max_mel_tokens:
-        samples = []
-        for b in tqdm(range(num_batches)):
-            codes = autoregressive_model.inference_speech(conditioning_input, text_input, **generation_kwargs)
-            samples.append(codes)
-        for batch in samples:
-            for i in range(batch.shape[0]):
-                batch[i] = fix_autoregressive_output(batch[i], stop_mel_token, complain=False)
-            clip_results.append(clip_model(text_input.repeat(batch.shape[0], 1), batch, return_loss=False))
-        clip_results = torch.cat(clip_results, dim=0)
-        samples = torch.cat(samples, dim=0)
-        best_results = samples[torch.topk(clip_results, k=clip_results_to_reduce_to).indices]
-
-
 def fix_autoregressive_output(codes, stop_token, complain=True):
     """
     This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was
@@ -222,7 +210,7 @@ class TextToSpeech:
         if not isinstance(voice_samples, list):
             voice_samples = [voice_samples]
         for vs in voice_samples:
-            conds.append(load_conditioning(vs))
+            conds.append(format_conditioning(vs))
         conds = torch.stack(conds, dim=1)
 
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
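The only call-site change inside TextToSpeech is the rename: the conditioning flow is still a list of reference clips -> per-clip MEL via format_conditioning -> one stacked tensor. A standalone sketch of that flow (illustrative; file names are hypothetical):

import torch
from utils.audio import load_audio
from api import format_conditioning

# Hypothetical reference recordings of the target speaker, loaded at 22.05 kHz.
voice_samples = [load_audio(p, 22050) for p in ('sample1.wav', 'sample2.wav')]

conds = [format_conditioning(vs) for vs in voice_samples]  # each: (1, n_mels, frames) on the GPU
conds = torch.stack(conds, dim=1)                          # batch the clips along dim=1, as in the hunk above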
read.py
CHANGED
@@ -5,10 +5,11 @@ import torch
 import torch.nn.functional as F
 import torchaudio
 
-from api import TextToSpeech, load_conditioning
+from api import TextToSpeech, format_conditioning
 from utils.audio import load_audio, get_voices
 from utils.tokenizer import VoiceBpeTokenizer
 
+
 def split_and_recombine_text(texts, desired_length=200, max_len=300):
     # TODO: also split across '!' and '?'. Attempt to keep quotations together.
     texts = [s.strip() + "." for s in texts.split('.')]
@@ -26,6 +27,7 @@ def split_and_recombine_text(texts, desired_length=200, max_len=300):
         texts.pop(i+1)
     return texts
 
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
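read.py feeds the input text file through split_and_recombine_text before synthesis. A quick usage sketch (illustrative, not part of the diff) showing how a paragraph is chunked into roughly sentence-aligned pieces no longer than max_len characters:

from read import split_and_recombine_text

text = ("Once upon a time there lived in a certain village a little country girl. "
        "She was the prettiest creature who was ever seen. "
        "Her mother was excessively fond of her.")
for chunk in split_and_recombine_text(text, desired_length=100, max_len=150):
    print(len(chunk), chunk)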
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ tokenizers
 inflect
 progressbar
 einops
-unidecode
+unidecode
+entmax