jbetker committed on
Commit
da31baa
1 Parent(s): a05af09

update requirements and some docs

Browse files
Files changed (3) hide show
  1. api.py +13 -25
  2. read.py +3 -1
  3. requirements.txt +2 -1
api.py CHANGED
@@ -21,7 +21,12 @@ from utils.tokenizer import VoiceBpeTokenizer, lev_distance
21
 
22
 
23
  pbar = None
 
 
24
  def download_models():
 
 
 
25
  MODELS = {
26
  'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/autoregressive.pth',
27
  'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/clvp.pth',
@@ -51,6 +56,9 @@ def download_models():
51
 
52
 
53
  def pad_or_truncate(t, length):
 
 
 
54
  if t.shape[-1] == length:
55
  return t
56
  elif t.shape[-1] < length:
@@ -68,7 +76,10 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
68
  conditioning_free=cond_free, conditioning_free_k=cond_free_k)
69
 
70
 
71
- def load_conditioning(clip, cond_length=132300):
 
 
 
72
  gap = clip.shape[-1] - cond_length
73
  if gap < 0:
74
  clip = F.pad(clip, pad=(0, abs(gap)))
@@ -79,29 +90,6 @@ def load_conditioning(clip, cond_length=132300):
79
  return mel_clip.unsqueeze(0).cuda()
80
 
81
 
82
- def clip_guided_generation(autoregressive_model, clip_model, conditioning_input, text_input, num_batches, stop_mel_token,
83
- tokens_per_clip_inference=10, clip_results_to_reduce_to=8, **generation_kwargs):
84
- """
85
- Uses a CLVP model trained to associate full text with **partial** audio clips to pick the best generation candidates
86
- every few iterations. The top results are then propagated forward through the generation process. Rinse and repeat.
87
- This is a hybrid between beam search and sampling.
88
- """
89
- token_goal = tokens_per_clip_inference
90
- finished = False
91
- while not finished and token_goal < autoregressive_model.max_mel_tokens:
92
- samples = []
93
- for b in tqdm(range(num_batches)):
94
- codes = autoregressive_model.inference_speech(conditioning_input, text_input, **generation_kwargs)
95
- samples.append(codes)
96
- for batch in samples:
97
- for i in range(batch.shape[0]):
98
- batch[i] = fix_autoregressive_output(batch[i], stop_mel_token, complain=False)
99
- clip_results.append(clip_model(text_input.repeat(batch.shape[0], 1), batch, return_loss=False))
100
- clip_results = torch.cat(clip_results, dim=0)
101
- samples = torch.cat(samples, dim=0)
102
- best_results = samples[torch.topk(clip_results, k=clip_results_to_reduce_to).indices]
103
-
104
-
105
  def fix_autoregressive_output(codes, stop_token, complain=True):
106
  """
107
  This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was
@@ -222,7 +210,7 @@ class TextToSpeech:
222
  if not isinstance(voice_samples, list):
223
  voice_samples = [voice_samples]
224
  for vs in voice_samples:
225
- conds.append(load_conditioning(vs))
226
  conds = torch.stack(conds, dim=1)
227
 
228
  diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
21
 
22
 
23
  pbar = None
24
+
25
+
26
  def download_models():
27
+ """
28
+ Call to download all the models that Tortoise uses.
29
+ """
30
  MODELS = {
31
  'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/autoregressive.pth',
32
  'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/clvp.pth',
 
56
 
57
 
58
  def pad_or_truncate(t, length):
59
+ """
60
+ Utility function for forcing <t> to have the specified sequence length, whether by clipping it or padding it with 0s.
61
+ """
62
  if t.shape[-1] == length:
63
  return t
64
  elif t.shape[-1] < length:
 
76
  conditioning_free=cond_free, conditioning_free_k=cond_free_k)
77
 
78
 
79
+ def format_conditioning(clip, cond_length=132300):
80
+ """
81
+ Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
82
+ """
83
  gap = clip.shape[-1] - cond_length
84
  if gap < 0:
85
  clip = F.pad(clip, pad=(0, abs(gap)))
 
90
  return mel_clip.unsqueeze(0).cuda()
91
 
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def fix_autoregressive_output(codes, stop_token, complain=True):
94
  """
95
  This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was
 
210
  if not isinstance(voice_samples, list):
211
  voice_samples = [voice_samples]
212
  for vs in voice_samples:
213
+ conds.append(format_conditioning(vs))
214
  conds = torch.stack(conds, dim=1)
215
 
216
  diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
read.py CHANGED
@@ -5,10 +5,11 @@ import torch
5
  import torch.nn.functional as F
6
  import torchaudio
7
 
8
- from api import TextToSpeech, load_conditioning
9
  from utils.audio import load_audio, get_voices
10
  from utils.tokenizer import VoiceBpeTokenizer
11
 
 
12
  def split_and_recombine_text(texts, desired_length=200, max_len=300):
13
  # TODO: also split across '!' and '?'. Attempt to keep quotations together.
14
  texts = [s.strip() + "." for s in texts.split('.')]
@@ -26,6 +27,7 @@ def split_and_recombine_text(texts, desired_length=200, max_len=300):
26
  texts.pop(i+1)
27
  return texts
28
 
 
29
  if __name__ == '__main__':
30
  parser = argparse.ArgumentParser()
31
  parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
 
5
  import torch.nn.functional as F
6
  import torchaudio
7
 
8
+ from api import TextToSpeech, format_conditioning
9
  from utils.audio import load_audio, get_voices
10
  from utils.tokenizer import VoiceBpeTokenizer
11
 
12
+
13
  def split_and_recombine_text(texts, desired_length=200, max_len=300):
14
  # TODO: also split across '!' and '?'. Attempt to keep quotations together.
15
  texts = [s.strip() + "." for s in texts.split('.')]
 
27
  texts.pop(i+1)
28
  return texts
29
 
30
+
31
  if __name__ == '__main__':
32
  parser = argparse.ArgumentParser()
33
  parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
requirements.txt CHANGED
@@ -6,4 +6,5 @@ tokenizers
6
  inflect
7
  progressbar
8
  einops
9
- unidecode
 
 
6
  inflect
7
  progressbar
8
  einops
9
+ unidecode
10
+ entmax