jbetker committed
Commit cdf44d7
1 Parent(s): 5a95c34

more fixes
tortoise/api.py CHANGED
@@ -194,8 +194,7 @@ class TextToSpeech:
         self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
                                            model_dim=1024,
                                            heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                           train_solo_embeddings=False,
-                                           average_conditioning_embeddings=True).cpu().eval()
+                                           train_solo_embeddings=False).cpu().eval()
         self.autoregressive.load_state_dict(torch.load(f'{models_dir}/autoregressive.pth'))
 
         self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
@@ -244,7 +243,7 @@ class TextToSpeech:
         kwargs.update(presets[preset])
         return self.tts(text, **kwargs)
 
-    def get_conditioning_latents(self, voice_samples):
+    def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
         These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@@ -268,7 +267,7 @@ class TextToSpeech:
             # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
             sample = torchaudio.functional.resample(sample, 22050, 24000)
             sample = pad_or_truncate(sample, 102400)
-            cond_mel = wav_to_univnet_mel(sample.to(voice_samples.device), do_normalization=False)
+            cond_mel = wav_to_univnet_mel(sample.to('cuda'), do_normalization=False)
             diffusion_conds.append(cond_mel)
         diffusion_conds = torch.stack(diffusion_conds, dim=1)
 
@@ -276,7 +275,10 @@ class TextToSpeech:
         diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
         self.diffusion = self.diffusion.cpu()
 
-        return auto_latent, diffusion_latent, auto_conds
+        if return_mels:
+            return auto_latent, diffusion_latent, auto_conds, diffusion_conds
+        else:
+            return auto_latent, diffusion_latent
 
     def get_random_conditioning_latents(self):
         # Lazy-load the RLG models.
@@ -295,7 +297,6 @@ class TextToSpeech:
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True,
             # autoregressive generation parameters follow
             num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
-            typical_sampling=False, typical_mass=.9,
             # CLVP & CVVP parameters
             clvp_cvvp_slider=.5,
             # diffusion generation parameters follow
@@ -354,13 +355,13 @@ class TextToSpeech:
 
         auto_conds = None
         if voice_samples is not None:
-            auto_conditioning, diffusion_conditioning, auto_conds = self.get_conditioning_latents(voice_samples)
+            auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True)
         elif conditioning_latents is not None:
             auto_conditioning, diffusion_conditioning = conditioning_latents
         else:
             auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
-            auto_conditioning = auto_conditioning.cuda()
-            diffusion_conditioning = diffusion_conditioning.cuda()
+        auto_conditioning = auto_conditioning.cuda()
+        diffusion_conditioning = diffusion_conditioning.cuda()
 
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
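Illustrative only, not part of the commit: a minimal sketch of how the reworked conditioning API could be driven after this change. It assumes a CUDA-capable machine with the default models downloaded and a hypothetical reference clip at voices/pat/1.wav; load_audio comes from tortoise.utils.audio.

    # Sketch: compute conditioning latents once, then reuse them across tts() calls.
    from tortoise.api import TextToSpeech
    from tortoise.utils.audio import load_audio

    tts = TextToSpeech()
    clips = [load_audio('voices/pat/1.wav', 22050)]  # hypothetical 22.05 kHz reference clip

    # Default behaviour after this commit: a 2-tuple of latents (return_mels=False).
    auto_latent, diffusion_latent = tts.get_conditioning_latents(clips)

    # With return_mels=True the raw conditioning inputs come back as well,
    # which is what tts() itself now requests internally.
    auto_latent, diffusion_latent, auto_conds, diffusion_conds = \
        tts.get_conditioning_latents(clips, return_mels=True)

    gen = tts.tts("Hello from Tortoise.", conditioning_latents=(auto_latent, diffusion_latent))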
tortoise/get_conditioning_latents.py CHANGED
@@ -11,8 +11,8 @@ other ML models, or can be augmented manually and fed back into Tortoise to affe
 """
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat')
-    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/conditioning_latents')
+    parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat2')
+    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/conditioning_latents')
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)
 
tortoise/models/autoregressive.py CHANGED
@@ -280,8 +280,7 @@ class UnifiedVoice(nn.Module):
                  mel_length_compression=1024, number_text_tokens=256,
                  start_text_token=None, number_mel_codes=8194, start_mel_token=8192,
                  stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True,
-                 checkpointing=True, average_conditioning_embeddings=False,
-                 types=1):
+                 checkpointing=True, types=1):
         """
         Args:
             layers: Number of layers in transformer stack.
@@ -300,7 +299,6 @@ class UnifiedVoice(nn.Module):
             train_solo_embeddings:
             use_mel_codes_as_input:
             checkpointing:
-            average_conditioning_embeddings: Whether or not conditioning embeddings should be averaged, instead of fed piecewise into the model.
         """
         super().__init__()
 
@@ -318,7 +316,6 @@ class UnifiedVoice(nn.Module):
         self.max_conditioning_inputs = max_conditioning_inputs
         self.mel_length_compression = mel_length_compression
         self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads)
-        self.average_conditioning_embeddings = average_conditioning_embeddings
         self.text_embedding = nn.Embedding(self.number_text_tokens*types+1, model_dim)
         if use_mel_codes_as_input:
             self.mel_embedding = nn.Embedding(self.number_mel_codes, model_dim)
@@ -397,8 +394,7 @@ class UnifiedVoice(nn.Module):
         for j in range(speech_conditioning_input.shape[1]):
             conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
         conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
+        conds = conds.mean(dim=1)
         return conds
 
     def forward(self, speech_conditioning_latent, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
@@ -461,65 +457,6 @@ class UnifiedVoice(nn.Module):
         loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
         return loss_text.mean(), loss_mel.mean(), mel_logits
 
-    def text_forward(self, speech_conditioning_input, text_inputs, text_lengths):
-        """
-        Performs autoregressive modeling on only text. Still requires a speech_conditioning_input due to the way the
-        model inputs are formatted. Just provide any audio clip (arguably, zeros could be provided).
-        """
-        assert self.max_text_tokens >= text_inputs.shape[1], f'{text_inputs.shape[1]}'
-
-        # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by
-        # chopping the inputs by the maximum actual length.
-        max_text_len = text_lengths.max()
-        text_inputs = F.pad(text_inputs[:, :max_text_len], (0,1), value=self.stop_text_token)
-
-        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
-        conds = []
-        for j in range(speech_conditioning_input.shape[1]):
-            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
-        conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
-
-        text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
-        text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + self.text_solo_embedding
-        text_logits = self.get_logits(conds, text_emb, self.text_head)
-        loss_text = F.cross_entropy(text_logits, text_targets.long())
-        return loss_text.mean()
-
-    def speech_forward(self, speech_conditioning_input, mel_codes, wav_lengths, raw_mels=None):
-        """
-        Performs autoregressive modeling on only speech data.
-        """
-        assert self.max_mel_tokens >= mel_codes.shape[1], f'{mel_codes.shape[1]}'
-
-        # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by
-        # chopping the inputs by the maximum actual length.
-        max_mel_len = wav_lengths.max() // self.mel_length_compression
-        mel_codes = F.pad(mel_codes[:, :max_mel_len], (0,1), value=self.stop_mel_token)
-        mel_codes = self.set_mel_padding(mel_codes, wav_lengths)
-        if raw_mels is not None:
-            raw_mels = raw_mels[:, :, :max_mel_len*4]
-
-        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
-        conds = []
-        for j in range(speech_conditioning_input.shape[1]):
-            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
-        conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
-
-        mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token)
-        if raw_mels is not None:
-            mel_inp = F.pad(raw_mels, (0, 4))
-        else:
-            mel_inp = mel_codes
-        mel_emb = self.mel_embedding(mel_inp)
-        mel_emb = mel_emb + self.mel_pos_embedding(mel_codes) + self.mel_solo_embedding
-        mel_logits = self.get_logits(conds, mel_emb, self.mel_head)
-        loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
-        return loss_mel.mean()
-
     def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
                          max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
         seq_length = self.max_mel_tokens + self.max_text_tokens + 2
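Illustrative only, not part of the commit: a toy sketch of what UnifiedVoice's conditioning path does after this change, with per-clip encoder outputs stacked and then always averaged over the clip dimension now that the average_conditioning_embeddings switch is gone. The helper name and the random tensors are stand-ins for real ConditioningEncoder outputs.

    import torch

    def average_clip_conditionings(per_clip_encodings):
        # per_clip_encodings: list of (batch, model_dim) tensors, one per conditioning clip
        conds = torch.stack(per_clip_encodings, dim=1)  # (batch, num_clips, model_dim)
        return conds.mean(dim=1)                        # (batch, model_dim)

    fake = [torch.randn(1, 1024) for _ in range(2)]     # stand-ins for two encoded clips
    print(average_clip_conditionings(fake).shape)       # torch.Size([1, 1024])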
tortoise/utils/audio.py CHANGED
@@ -87,7 +87,7 @@ def get_voices():
     for sub in subs:
         subj = os.path.join('voices', sub)
         if os.path.isdir(subj):
-            voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3'))
+            voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.pth'))
     return voices
 
 
@@ -111,6 +111,9 @@ def load_voices(voices):
     latents = []
     clips = []
     for voice in voices:
+        if voice == 'random':
+            print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
+            return None, None
         latent, clip = load_voice(voice)
         if latent is None:
             assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
@@ -119,10 +122,10 @@ def load_voices(voices):
             assert len(voices) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
             latents.append(latent)
     if len(latents) == 0:
-        return clips
+        return clips, None
     else:
         latents = torch.stack(latents, dim=0)
-        return latents.mean(dim=0)
+        return None, latents.mean(dim=0)
 
 
 class TacotronSTFT(torch.nn.Module):
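Illustrative only, not part of the commit: a small sketch of how a caller might handle load_voices' new two-element return value, where 'random' yields (None, None), raw-audio voices yield (clips, None), and latent (.pth) voices yield (None, averaged_latents). The 'pat' voice folder is hypothetical.

    from tortoise.utils.audio import load_voices

    clips, latents = load_voices(['pat'])  # hypothetical folder of .wav/.mp3 clips under voices/
    if clips is None and latents is None:
        print("Random voice requested; let tts() draw random conditioning latents.")
    elif latents is None:
        print(f"Loaded {len(clips)} raw reference clips.")
    else:
        print("Loaded pre-computed conditioning latents (averaged across voices).")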