jbetker committed
Commit aa5c5df
1 Parent(s): 301bf48
.gitignore CHANGED
@@ -130,3 +130,4 @@ dmypy.json
 .pyre/
 
 .idea/*
+.models/*
.models/clip.pth DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8ab5a7751b6098b7e57528b5d812ea2ffbaa16f1b36c02e143c501c74900140d
-size 271601435
api.py CHANGED
@@ -23,9 +23,11 @@ from utils.tokenizer import VoiceBpeTokenizer, lev_distance
 pbar = None
 def download_models():
     MODELS = {
-        'clip.pth': 'https://huggingface.co/jbetker/tortoise-tts-clip/resolve/main/pytorch-model.bin',
-        'diffusion.pth': 'https://huggingface.co/jbetker/tortoise-tts-diffusion-v1/resolve/main/pytorch-model.bin',
-        'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-autoregressive/resolve/main/pytorch-model.bin'
+        'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth',
+        'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clip.pth',
+        'cvvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/cvvp.pth',
+        'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth',
+        'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
     }
     os.makedirs('.models', exist_ok=True)
     def show_progress(block_num, block_size, total_size):
@@ -162,25 +164,12 @@ class TextToSpeech:
                                           train_solo_embeddings=False,
                                           average_conditioning_embeddings=True).cpu().eval()
         self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
-        '''
-        self.autoregressive = UnifiedVoice(max_mel_tokens=2048, max_text_tokens=1024, max_conditioning_inputs=1, layers=42,
-                                           model_dim=1152, heads=18, number_text_tokens=256, train_solo_embeddings=False,
-                                           average_conditioning_embeddings=True, types=2).cpu().eval()
-        self.autoregressive.load_state_dict(torch.load('X:\\dlas\\experiments\\train_gpt_tts_xl\\models\\15250_gpt_ema.pth'))
-        '''
-
-        self.autoregressive_for_diffusion = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
-                                                         model_dim=1024,
-                                                         heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                                         train_solo_embeddings=False,
-                                                         average_conditioning_embeddings=True).cpu().eval()
-        self.autoregressive_for_diffusion.load_state_dict(torch.load('.models/autoregressive.pth'))
 
         self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
                          text_seq_len=350, text_heads=8,
                          num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
                          use_xformers=True).cpu().eval()
-        self.clvp.load_state_dict(torch.load('.models/clip.pth'))
+        self.clvp.load_state_dict(torch.load('.models/clvp.pth'))
 
         self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
@@ -213,7 +202,7 @@ class TextToSpeech:
             'ultra_fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 16, 'cond_free': False},
             'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
-            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 2048},
+            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
         }
         kwargs.update(presets[preset])
         return self.tts(text, voice_samples, **kwargs)
@@ -281,11 +270,11 @@ class TextToSpeech:
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
            # results, but will increase memory usage.
-            self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cuda()
-            best_latents = self.autoregressive_for_diffusion(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
-                                                             torch.tensor([best_results.shape[-1]*self.autoregressive_for_diffusion.mel_length_compression], device=conds.device),
+            self.autoregressive = self.autoregressive.cuda()
+            best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
+                                               torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
                                                return_latent=True, clip_inputs=False)
-            self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cpu()
+            self.autoregressive = self.autoregressive.cpu()
 
             print("Performing vocoding..")
             wav_candidates = []
data/mel_norms.pth CHANGED
Binary files a/data/mel_norms.pth and b/data/mel_norms.pth differ
do_tts.py CHANGED
@@ -11,6 +11,10 @@ if __name__ == '__main__':
     parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
+                        help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intelligibility',
+                        default=.5)
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)
@@ -25,6 +29,6 @@ if __name__ == '__main__':
     for cond_path in cond_paths:
         c = load_audio(cond_path, 22050)
         conds.append(c)
-    gen = tts.tts_with_preset(args.text, conds, preset='standard')
+    gen = tts.tts_with_preset(args.text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
     torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
 
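The updated call path above can also be exercised directly from Python. Below is a minimal sketch, not part of the commit: the import locations for TextToSpeech and load_audio, the example voice clip path, and the output filename are assumptions; the tts_with_preset arguments follow the hunks shown above.

    # Hypothetical usage sketch; import paths and file paths are illustrative assumptions.
    import torchaudio
    from api import TextToSpeech
    from utils.audio import load_audio  # assumed location of the load_audio helper

    tts = TextToSpeech()
    # Reference clip(s) loaded at 22.05 kHz, as in do_tts.py.
    conds = [load_audio('voices/patrick_stewart/1.wav', 22050)]
    gen = tts.tts_with_preset('I am a language model that has learned to speak.',
                              conds, preset='standard', clvp_cvvp_slider=0.5)
    # Generated audio is saved at 24 kHz, matching the scripts above.
    torchaudio.save('results/patrick_stewart.wav', gen.squeeze(0).cpu(), 24000)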
models/arch_util.py CHANGED
@@ -5,8 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio
-from x_transformers import ContinuousTransformerWrapper
-from x_transformers.x_transformers import RelativePositionBias
+from models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias
 
 
 def zero_module(module):
models/clvp.py CHANGED
@@ -2,10 +2,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import einsum
-from x_transformers import Encoder
 
 from models.arch_util import CheckpointedXTransformerEncoder
 from models.transformer import Transformer
+from models.xtransformers import Encoder
 
 
 def exists(val):
models/xtransformers.py CHANGED
@@ -1253,50 +1253,3 @@ class ContinuousTransformerWrapper(nn.Module):
1253
  return tuple(res)
1254
  return res[0]
1255
 
1256
-
1257
- class XTransformer(nn.Module):
1258
- def __init__(
1259
- self,
1260
- *,
1261
- dim,
1262
- tie_token_emb=False,
1263
- **kwargs
1264
- ):
1265
- super().__init__()
1266
- enc_kwargs, kwargs = groupby_prefix_and_trim('enc_', kwargs)
1267
- dec_kwargs, kwargs = groupby_prefix_and_trim('dec_', kwargs)
1268
-
1269
- assert 'dim' not in enc_kwargs and 'dim' not in dec_kwargs, 'dimension of either encoder or decoder must be set with `dim` keyword'
1270
- enc_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], enc_kwargs)
1271
- enc_transformer_kwargs['emb_dropout'] = enc_kwargs.pop('emb_dropout', 0)
1272
- enc_transformer_kwargs['num_memory_tokens'] = enc_kwargs.pop('num_memory_tokens', None)
1273
- enc_transformer_kwargs['use_pos_emb'] = enc_kwargs.pop('use_pos_emb', True)
1274
-
1275
- dec_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], dec_kwargs)
1276
- dec_transformer_kwargs['emb_dropout'] = dec_kwargs.pop('emb_dropout', 0)
1277
- dec_transformer_kwargs['use_pos_emb'] = dec_kwargs.pop('use_pos_emb', True)
1278
-
1279
- self.encoder = TransformerWrapper(
1280
- **enc_transformer_kwargs,
1281
- attn_layers=Encoder(dim=dim, **enc_kwargs)
1282
- )
1283
-
1284
- self.decoder = TransformerWrapper(
1285
- **dec_transformer_kwargs,
1286
- attn_layers=Decoder(dim=dim, cross_attend=True, **dec_kwargs)
1287
- )
1288
-
1289
- if tie_token_emb:
1290
- self.decoder.token_emb = self.encoder.token_emb
1291
-
1292
- self.decoder = AutoregressiveWrapper(self.decoder)
1293
-
1294
- @torch.no_grad()
1295
- def generate(self, seq_in, seq_out_start, seq_len, src_mask=None, src_attn_mask=None, **kwargs):
1296
- encodings = self.encoder(seq_in, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
1297
- return self.decoder.generate(seq_out_start, seq_len, context=encodings, context_mask=src_mask, **kwargs)
1298
-
1299
- def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_attn_mask=None):
1300
- enc = self.encoder(src, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
1301
- out = self.decoder(tgt, context=enc, mask=tgt_mask, context_mask=src_mask)
1302
- return out
1253
  return tuple(res)
1254
  return res[0]
1255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
read.py CHANGED
@@ -28,11 +28,14 @@ def split_and_recombine_text(texts, desired_length=200, max_len=300):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood2.txt")
+    parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
-    parser.add_argument('--generation_preset', type=str, help='Preset to use for generation', default='standard')
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
+                        help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intelligibility',
+                        default=.5)
     args = parser.parse_args()
 
     outpath = args.output_path
@@ -60,16 +63,11 @@ if __name__ == '__main__':
     if not cond_paths:
        print('Error: no valid voices specified. Try again.')
 
-    priors = []
+    conds = []
+    for cond_path in cond_paths:
+        c = load_audio(cond_path, 22050)
+        conds.append(c)
     for j, text in enumerate(texts):
-        conds = priors.copy()
-        for cond_path in cond_paths:
-            c = load_audio(cond_path, 22050)
-            conds.append(c)
-        gen = tts.tts_with_preset(text, conds, preset=args.generation_preset)
+        gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
         torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen.squeeze(0).cpu(), 24000)
 
-        priors.append(torchaudio.functional.resample(gen, 24000, 22050).squeeze(0))
-        while len(priors) > 2:
-            priors.pop(0)
-
requirements.txt CHANGED
@@ -6,5 +6,4 @@ tokenizers
 inflect
 progressbar
 einops
-unidecode
-x-transformers
+unidecode