jbetker committed
Commit 0ffc191 (parent: a8264f5)

Add support for extracting and feeding conditioning latents directly into the model


- Adds a new script and API endpoints for doing this
- Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost)
- Updates README

This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before
it becomes a problem...). A sketch of the intended round-trip follows the test list.
1) Does get_conditioning_latents.py work?
2) Can I feed those latents back into the model by creating a new voice?
3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py?
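
For reference while running these checks, here is an untested sketch of the round-trip that tests 1) and 2) exercise. The clip paths, voice name, and output filenames are placeholders; the calls mirror the new surface added to api.py in this commit.

```python
import torch
import torchaudio

from api import TextToSpeech
from tortoise.utils.audio import load_audio

tts = TextToSpeech()

# 1) Extract conditioning latents from a few ~10 second reference clips (loaded at 22.05kHz, as elsewhere in the repo).
clips = [load_audio(p, 22050) for p in ['voices/pat/1.wav', 'voices/pat/2.wav']]
auto_latent, diffusion_latent = tts.get_conditioning_latents(clips)
torch.save((auto_latent, diffusion_latent), 'pat_latents.pth')

# 2) Feed the latents back in, in lieu of voice_samples.
gen = tts.tts_with_preset('Testing raw conditioning latents.', preset='fast',
                          voice_samples=None,
                          conditioning_latents=(auto_latent, diffusion_latent))
torchaudio.save('latent_test.wav', gen.squeeze(0).cpu(), 24000)
```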

README.md CHANGED
@@ -118,12 +118,24 @@ These settings are not available in the normal scripts packaged with Tortoise. T
118
 
119
  ### Playing with the voice latent
120
 
121
- Tortoise ingests reference clips by feeding them through individually through a small submodel that produces a point latent, then taking the mean of all of the produced latents. The experimentation I have done has indicated that these point latents are quite expressive, affecting
122
- everything from tone to speaking rate to speech abnormalities.
 
123
 
124
- This lends itself to some neat tricks. For example, you can combine feed two different voices to tortoise and it will output what it thinks the "average" of those two voices sounds like. You could also theoretically build a small extension to Tortoise that gradually shifts the
125
- latent from one speaker to another, then apply it across a bit of spoken text (something I havent implemented yet, but might
126
- get to soon!) I am sure there are other interesting things that can be done here. Please let me know what you find!
 
127
 
128
  ### Send me feedback!
129
 
 
118
 
119
  ### Playing with the voice latent
120
 
121
+ Tortoise ingests reference clips by feeding them individually through a small submodel that produces a point latent,
122
+ then taking the mean of all of the produced latents. The experimentation I have done has indicated that these point latents
123
+ are quite expressive, affecting everything from tone to speaking rate to speech abnormalities.
124
 
125
+ This lends itself to some neat tricks. For example, you can feed two different voices to tortoise and it will output
126
+ what it thinks the "average" of those two voices sounds like.
127
+
128
+ #### Generating conditioning latents from voices
129
+
130
+ Use the script `get_conditioning_latents.py` to extract conditioning latents for a voice you have installed. This script
131
+ will dump the latents to a .pth pickle file. The file will contain a single tuple, (autoregressive_latent, diffusion_latent).
132
+
133
+ Alternatively, use api.TextToSpeech.get_conditioning_latents() to fetch the latents.
134
+
135
+ #### Using raw conditioning latents to generate speech
136
+
137
+ After you've played with them, you can use them to generate speech by creating a subdirectory in voices/ with a single
138
+ ".pth" file containing the pickled conditioning latents as a tuple (autoregressive_latent, diffusion_latent).
139
 
140
  ### Send me feedback!
141
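
As a concrete illustration of the two README subsections above (the voice name and paths are illustrative; the voices/ root is wherever get_voices() looks in your install), the dumped .pth can be loaded, tweaked, and re-installed as a new voice:

```python
import os
import torch

# Load the tuple dumped by get_conditioning_latents.py (or returned by api.TextToSpeech.get_conditioning_latents()).
auto_latent, diffusion_latent = torch.load('results/conditioning_latents/pat.pth')

# ...optionally perturb the latents here to change vocal qualities...

# Install them as a new voice: a subdirectory of voices/ containing a single .pth file.
os.makedirs('voices/pat-latent', exist_ok=True)
torch.save((auto_latent, diffusion_latent), 'voices/pat-latent/latents.pth')
```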
 
tortoise/api.py CHANGED
@@ -121,23 +121,14 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
121
  return codes
122
 
123
 
124
- def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_samples, temperature=1, verbose=True):
125
  """
126
  Uses the specified diffusion model to convert discrete codes into a spectrogram.
127
  """
128
  with torch.no_grad():
129
- cond_mels = []
130
- for sample in conditioning_samples:
131
- # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
132
- sample = torchaudio.functional.resample(sample, 22050, 24000)
133
- sample = pad_or_truncate(sample, 102400)
134
- cond_mel = wav_to_univnet_mel(sample.to(latents.device), do_normalization=False)
135
- cond_mels.append(cond_mel)
136
- cond_mels = torch.stack(cond_mels, dim=1)
137
-
138
  output_seq_len = latents.shape[1] * 4 * 24000 // 22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
139
  output_shape = (latents.shape[0], 100, output_seq_len)
140
- precomputed_embeddings = diffusion_model.timestep_independent(latents, cond_mels, output_seq_len, False)
141
 
142
  noise = torch.randn(output_shape, device=latents.device) * temperature
143
  mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
@@ -204,7 +195,7 @@ class TextToSpeech:
204
  self.vocoder.load_state_dict(torch.load(f'{models_dir}/vocoder.pth')['model_g'])
205
  self.vocoder.eval(inference=True)
206
 
207
- def tts_with_preset(self, text, voice_samples, preset='fast', **kwargs):
208
  """
209
  Calls TTS with one of a set of preset generation parameters. Options:
210
  'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest).
@@ -225,9 +216,43 @@ class TextToSpeech:
225
  'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
226
  }
227
  kwargs.update(presets[preset])
228
- return self.tts(text, voice_samples, **kwargs)
 
229
 
230
- def tts(self, text, voice_samples, k=1, verbose=True,
 
 
231
  # autoregressive generation parameters follow
232
  num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
233
  typical_sampling=False, typical_mass=.9,
@@ -240,6 +265,9 @@ class TextToSpeech:
240
  Produces an audio clip of the given text being spoken with the given reference voice.
241
  :param text: Text to be spoken.
242
  :param voice_samples: List of 2 or more ~10 second reference clips which should be torch tensors containing 22.05kHz waveform data.
 
 
 
243
  :param k: The number of returned clips. The most likely (as determined by Tortoises' CLVP and CVVP models) clips are returned.
244
  :param verbose: Whether or not to print log messages indicating the progress of creating a clip. Default=true.
245
  ~~AUTOREGRESSIVE KNOBS~~
@@ -283,12 +311,10 @@ class TextToSpeech:
283
  text = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).cuda()
284
  text = F.pad(text, (0, 1)) # This may not be necessary.
285
 
286
- conds = []
287
- if not isinstance(voice_samples, list):
288
- voice_samples = [voice_samples]
289
- for vs in voice_samples:
290
- conds.append(format_conditioning(vs))
291
- conds = torch.stack(conds, dim=1)
292
 
293
  diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
294
 
@@ -301,7 +327,7 @@ class TextToSpeech:
301
  if verbose:
302
  print("Generating autoregressive samples..")
303
  for b in tqdm(range(num_batches), disable=not verbose):
304
- codes = self.autoregressive.inference_speech(conds, text,
305
  do_sample=True,
306
  top_p=top_p,
307
  temperature=temperature,
@@ -340,16 +366,18 @@ class TextToSpeech:
340
  # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
341
  # results, but will increase memory usage.
342
  self.autoregressive = self.autoregressive.cuda()
343
- best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
344
  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
345
  return_latent=True, clip_inputs=False)
346
  self.autoregressive = self.autoregressive.cpu()
 
347
 
348
  if verbose:
349
  print("Transforming autoregressive outputs into audio..")
350
  wav_candidates = []
351
  self.diffusion = self.diffusion.cuda()
352
  self.vocoder = self.vocoder.cuda()
 
353
  for b in range(best_results.shape[0]):
354
  codes = best_results[b].unsqueeze(0)
355
  latents = best_latents[b].unsqueeze(0)
@@ -365,7 +393,8 @@ class TextToSpeech:
365
  latents = latents[:, :k]
366
  break
367
 
368
- mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, voice_samples, temperature=diffusion_temperature, verbose=verbose)
 
369
  wav = self.vocoder.inference(mel)
370
  wav_candidates.append(wav.cpu())
371
  self.diffusion = self.diffusion.cpu()
 
121
  return codes
122
 
123
 
124
+ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True):
125
  """
126
  Uses the specified diffusion model to convert discrete codes into a spectrogram.
127
  """
128
  with torch.no_grad():
 
 
 
 
 
 
 
 
 
129
  output_seq_len = latents.shape[1] * 4 * 24000 // 22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
130
  output_shape = (latents.shape[0], 100, output_seq_len)
131
+ precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False)
132
 
133
  noise = torch.randn(output_shape, device=latents.device) * temperature
134
  mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
 
195
  self.vocoder.load_state_dict(torch.load(f'{models_dir}/vocoder.pth')['model_g'])
196
  self.vocoder.eval(inference=True)
197
 
198
+ def tts_with_preset(self, text, preset='fast', **kwargs):
199
  """
200
  Calls TTS with one of a set of preset generation parameters. Options:
201
  'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest).
 
216
  'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
217
  }
218
  kwargs.update(presets[preset])
219
+ return self.tts(text, **kwargs)
220
+
221
+ def get_conditioning_latents(self, voice_samples):
222
+ """
223
+ Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
224
+ These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
225
+ properties.
226
+ :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
227
+ """
228
+ if not isinstance(voice_samples, list):
+ voice_samples = [voice_samples]
+ voice_samples = [v.to('cuda') for v in voice_samples]
229
+
230
+ auto_conds = []
233
+ for vs in voice_samples:
234
+ auto_conds.append(format_conditioning(vs))
235
+ auto_conds = torch.stack(auto_conds, dim=1)
236
+ self.autoregressive = self.autoregressive.cuda()
237
+ auto_latent = self.autoregressive.get_conditioning(auto_conds)
238
+ self.autoregressive = self.autoregressive.cpu()
239
+
240
+ diffusion_conds = []
241
+ for sample in voice_samples:
242
+ # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
243
+ sample = torchaudio.functional.resample(sample, 22050, 24000)
244
+ sample = pad_or_truncate(sample, 102400)
245
+ cond_mel = wav_to_univnet_mel(sample.to('cuda'), do_normalization=False)
246
+ diffusion_conds.append(cond_mel)
247
+ diffusion_conds = torch.stack(diffusion_conds, dim=1)
248
+
249
+ self.diffusion = self.diffusion.cuda()
250
+ diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
251
+ self.diffusion = self.diffusion.cpu()
252
 
253
+ return auto_latent, diffusion_latent
254
+
255
+ def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True,
256
  # autoregressive generation parameters follow
257
  num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
258
  typical_sampling=False, typical_mass=.9,
 
265
  Produces an audio clip of the given text being spoken with the given reference voice.
266
  :param text: Text to be spoken.
267
  :param voice_samples: List of 2 or more ~10 second reference clips which should be torch tensors containing 22.05kHz waveform data.
268
+ :param conditioning_latents: A tuple of (autoregressive_conditioning_latent, diffusion_conditioning_latent), which
269
+ can be provided in lieu of voice_samples. This is ignored unless voice_samples=None.
270
+ Conditioning latents can be retrieved via get_conditioning_latents().
271
  :param k: The number of returned clips. The most likely (as determined by Tortoises' CLVP and CVVP models) clips are returned.
272
  :param verbose: Whether or not to print log messages indicating the progress of creating a clip. Default=true.
273
  ~~AUTOREGRESSIVE KNOBS~~
 
311
  text = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).cuda()
312
  text = F.pad(text, (0, 1)) # This may not be necessary.
313
 
314
+ if voice_samples is not None:
315
+ auto_conditioning, diffusion_conditioning = self.get_conditioning_latents(voice_samples)
316
+ else:
317
+ auto_conditioning, diffusion_conditioning = conditioning_latents
 
 
318
 
319
  diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
320
 
 
327
  if verbose:
328
  print("Generating autoregressive samples..")
329
  for b in tqdm(range(num_batches), disable=not verbose):
330
+ codes = self.autoregressive.inference_speech(auto_conditioning, text,
331
  do_sample=True,
332
  top_p=top_p,
333
  temperature=temperature,
 
366
  # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
367
  # results, but will increase memory usage.
368
  self.autoregressive = self.autoregressive.cuda()
369
+ best_latents = self.autoregressive(auto_conditioning, text, torch.tensor([text.shape[-1]], device=text.device), best_results,
370
  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text.device),
371
  return_latent=True, clip_inputs=False)
372
  self.autoregressive = self.autoregressive.cpu()
373
+ del auto_conditioning
374
 
375
  if verbose:
376
  print("Transforming autoregressive outputs into audio..")
377
  wav_candidates = []
378
  self.diffusion = self.diffusion.cuda()
379
  self.vocoder = self.vocoder.cuda()
381
  for b in range(best_results.shape[0]):
382
  codes = best_results[b].unsqueeze(0)
383
  latents = best_latents[b].unsqueeze(0)
 
393
  latents = latents[:, :k]
394
  break
395
 
396
+ mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
397
+ temperature=diffusion_temperature, verbose=verbose)
398
  wav = self.vocoder.inference(mel)
399
  wav_candidates.append(wav.cpu())
400
  self.diffusion = self.diffusion.cpu()
tortoise/do_tts.py CHANGED
@@ -4,7 +4,7 @@ import os
4
  import torchaudio
5
 
6
  from api import TextToSpeech
7
- from tortoise.utils.audio import load_audio, get_voices
8
 
9
  if __name__ == '__main__':
10
  parser = argparse.ArgumentParser()
@@ -21,14 +21,10 @@ if __name__ == '__main__':
21
 
22
  tts = TextToSpeech()
23
 
24
- voices = get_voices()
25
  selected_voices = args.voice.split(',')
26
  for voice in selected_voices:
27
- cond_paths = voices[voice]
28
- conds = []
29
- for cond_path in cond_paths:
30
- c = load_audio(cond_path, 22050)
31
- conds.append(c)
32
- gen = tts.tts_with_preset(args.text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
33
  torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
34
 
 
4
  import torchaudio
5
 
6
  from api import TextToSpeech
7
+ from tortoise.utils.audio import load_audio, get_voices, load_voice
8
 
9
  if __name__ == '__main__':
10
  parser = argparse.ArgumentParser()
 
21
 
22
  tts = TextToSpeech()
23
 
 
24
  selected_voices = args.voice.split(',')
25
  for voice in selected_voices:
26
+ voice_samples, conditioning_latents = load_voice(voice)
27
+ gen = tts.tts_with_preset(args.text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
28
+ preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
 
 
 
29
  torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
30
 
tortoise/get_conditioning_latents.py ADDED
@@ -0,0 +1,30 @@
1
+ import argparse
2
+ import os
3
+ import torch
4
+
5
+ from api import TextToSpeech
6
+ from tortoise.utils.audio import load_audio, get_voices
7
+
8
+ """
9
+ Dumps the conditioning latents for the specified voice to disk. These are expressive latents which can be used for
10
+ other ML models, or can be augmented manually and fed back into Tortoise to affect vocal qualities.
11
+ """
12
+ if __name__ == '__main__':
13
+ parser = argparse.ArgumentParser()
14
+ parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat')
15
+ parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/conditioning_latents')
16
+ args = parser.parse_args()
17
+ os.makedirs(args.output_path, exist_ok=True)
18
+
19
+ tts = TextToSpeech()
20
+ voices = get_voices()
21
+ selected_voices = args.voice.split(',')
22
+ for voice in selected_voices:
23
+ cond_paths = voices[voice]
24
+ conds = []
25
+ for cond_path in cond_paths:
26
+ c = load_audio(cond_path, 22050)
27
+ conds.append(c)
28
+ conditioning_latents = tts.get_conditioning_latents(conds)
29
+ torch.save(conditioning_latents, os.path.join(args.output_path, f'{voice}.pth'))
30
+
tortoise/models/autoregressive.py CHANGED
@@ -390,6 +390,17 @@ class UnifiedVoice(nn.Module):
390
  else:
391
  return first_logits
392
 
393
  def forward(self, speech_conditioning_input, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
394
  return_latent=False, clip_inputs=True):
395
  """
@@ -424,14 +435,7 @@ class UnifiedVoice(nn.Module):
424
  text_inputs = F.pad(text_inputs, (0,1), value=self.stop_text_token)
425
  mel_codes = F.pad(mel_codes, (0,1), value=self.stop_mel_token)
426
 
427
- speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
428
- conds = []
429
- for j in range(speech_conditioning_input.shape[1]):
430
- conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
431
- conds = torch.stack(conds, dim=1)
432
- if self.average_conditioning_embeddings:
433
- conds = conds.mean(dim=1).unsqueeze(1)
434
-
435
  text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
436
  text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
437
  mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token)
@@ -516,7 +520,7 @@ class UnifiedVoice(nn.Module):
516
  loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
517
  return loss_mel.mean()
518
 
519
- def inference_speech(self, speech_conditioning_input, text_inputs, input_tokens=None, num_return_sequences=1,
520
  max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
521
  seq_length = self.max_mel_tokens + self.max_text_tokens + 2
522
  if not hasattr(self, 'inference_model'):
@@ -536,14 +540,7 @@ class UnifiedVoice(nn.Module):
536
  text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
537
  text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
538
 
539
- speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
540
- conds = []
541
- for j in range(speech_conditioning_input.shape[1]):
542
- conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
543
- conds = torch.stack(conds, dim=1)
544
- if self.average_conditioning_embeddings:
545
- conds = conds.mean(dim=1).unsqueeze(1)
546
-
547
  emb = torch.cat([conds, text_emb], dim=1)
548
  self.inference_model.store_mel_emb(emb)
549
 
 
390
  else:
391
  return first_logits
392
 
393
+ def get_conditioning(self, speech_conditioning_input):
394
+ speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(
395
+ speech_conditioning_input.shape) == 3 else speech_conditioning_input
396
+ conds = []
397
+ for j in range(speech_conditioning_input.shape[1]):
398
+ conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
399
+ conds = torch.stack(conds, dim=1)
400
+ if self.average_conditioning_embeddings:
401
+ conds = conds.mean(dim=1).unsqueeze(1)
402
+ return conds
403
+
404
  def forward(self, speech_conditioning_input, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
405
  return_latent=False, clip_inputs=True):
406
  """
 
435
  text_inputs = F.pad(text_inputs, (0,1), value=self.stop_text_token)
436
  mel_codes = F.pad(mel_codes, (0,1), value=self.stop_mel_token)
437
 
438
+ conds = self.get_conditioning(speech_conditioning_input)
 
 
 
 
 
 
 
439
  text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
440
  text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
441
  mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token)
 
520
  loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
521
  return loss_mel.mean()
522
 
523
+ def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
524
  max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
525
  seq_length = self.max_mel_tokens + self.max_text_tokens + 2
526
  if not hasattr(self, 'inference_model'):
 
540
  text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
541
  text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
542
 
543
+ conds = speech_conditioning_latent
 
 
 
 
 
 
 
544
  emb = torch.cat([conds, text_emb], dim=1)
545
  self.inference_model.store_mel_emb(emb)
546
 
tortoise/models/diffusion_decoder.py CHANGED
@@ -219,18 +219,21 @@ class DiffusionTts(nn.Module):
219
  }
220
  return groups
221
 
222
- def timestep_independent(self, aligned_conditioning, conditioning_input, expected_seq_len, return_code_pred):
223
- # Shuffle aligned_latent to BxCxS format
224
- if is_latent(aligned_conditioning):
225
- aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
226
-
227
- # Note: this block does not need to repeated on inference, since it is not timestep-dependent or x-dependent.
228
  speech_conditioning_input = conditioning_input.unsqueeze(1) if len(
229
  conditioning_input.shape) == 3 else conditioning_input
230
  conds = []
231
  for j in range(speech_conditioning_input.shape[1]):
232
  conds.append(self.contextual_embedder(speech_conditioning_input[:, j]))
233
  conds = torch.cat(conds, dim=-1)
 
 
 
 
 
 
 
 
234
  cond_emb = conds.mean(dim=-1)
235
  cond_scale, cond_shift = torch.chunk(cond_emb, 2, dim=1)
236
  if is_latent(aligned_conditioning):
@@ -257,19 +260,19 @@ class DiffusionTts(nn.Module):
257
  mel_pred = mel_pred * unconditioned_batches.logical_not()
258
  return expanded_code_emb, mel_pred
259
 
260
- def forward(self, x, timesteps, aligned_conditioning=None, conditioning_input=None, precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False):
261
  """
262
  Apply the model to an input batch.
263
 
264
  :param x: an [N x C x ...] Tensor of inputs.
265
  :param timesteps: a 1-D batch of timesteps.
266
  :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
267
- :param conditioning_input: a full-resolution audio clip that is used as a reference to the style you want decoded.
268
  :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent()
269
  :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
270
  :return: an [N x C x ...] Tensor of outputs.
271
  """
272
- assert precomputed_aligned_embeddings is not None or (aligned_conditioning is not None and conditioning_input is not None)
273
  assert not (return_code_pred and precomputed_aligned_embeddings is not None) # These two are mutually exclusive.
274
 
275
  unused_params = []
@@ -281,7 +284,7 @@ class DiffusionTts(nn.Module):
281
  if precomputed_aligned_embeddings is not None:
282
  code_emb = precomputed_aligned_embeddings
283
  else:
284
- code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_input, x.shape[-1], True)
285
  if is_latent(aligned_conditioning):
286
  unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters()))
287
  else:
 
219
  }
220
  return groups
221
 
222
+ def get_conditioning(self, conditioning_input):
 
 
 
 
 
223
  speech_conditioning_input = conditioning_input.unsqueeze(1) if len(
224
  conditioning_input.shape) == 3 else conditioning_input
225
  conds = []
226
  for j in range(speech_conditioning_input.shape[1]):
227
  conds.append(self.contextual_embedder(speech_conditioning_input[:, j]))
228
  conds = torch.cat(conds, dim=-1)
229
+ return conds
230
+
231
+ def timestep_independent(self, aligned_conditioning, conditioning_latent, expected_seq_len, return_code_pred):
232
+ # Shuffle aligned_latent to BxCxS format
233
+ if is_latent(aligned_conditioning):
234
+ aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
235
+
236
+ conds = conditioning_latent
237
  cond_emb = conds.mean(dim=-1)
238
  cond_scale, cond_shift = torch.chunk(cond_emb, 2, dim=1)
239
  if is_latent(aligned_conditioning):
 
260
  mel_pred = mel_pred * unconditioned_batches.logical_not()
261
  return expanded_code_emb, mel_pred
262
 
263
+ def forward(self, x, timesteps, aligned_conditioning=None, conditioning_latent=None, precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False):
264
  """
265
  Apply the model to an input batch.
266
 
267
  :param x: an [N x C x ...] Tensor of inputs.
268
  :param timesteps: a 1-D batch of timesteps.
269
  :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
270
+ :param conditioning_latent: a pre-computed conditioning latent; see get_conditioning().
271
  :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent()
272
  :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
273
  :return: an [N x C x ...] Tensor of outputs.
274
  """
275
+ assert precomputed_aligned_embeddings is not None or (aligned_conditioning is not None and conditioning_latent is not None)
276
  assert not (return_code_pred and precomputed_aligned_embeddings is not None) # These two are mutually exclusive.
277
 
278
  unused_params = []
 
284
  if precomputed_aligned_embeddings is not None:
285
  code_emb = precomputed_aligned_embeddings
286
  else:
287
+ code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_latent, x.shape[-1], True)
288
  if is_latent(aligned_conditioning):
289
  unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters()))
290
  else:
tortoise/read.py CHANGED
@@ -5,7 +5,7 @@ import torch
5
  import torchaudio
6
 
7
  from api import TextToSpeech
8
- from tortoise.utils.audio import load_audio, get_voices
9
 
10
 
11
  def split_and_recombine_text(texts, desired_length=200, max_len=300):
@@ -40,7 +40,6 @@ if __name__ == '__main__':
40
  args = parser.parse_args()
41
 
42
  outpath = args.output_path
43
- voices = get_voices()
44
  selected_voices = args.voice.split(',')
45
  regenerate = args.regenerate
46
  if regenerate is not None:
@@ -58,25 +57,15 @@ if __name__ == '__main__':
58
  voice_sel = selected_voice.split('&')
59
  else:
60
  voice_sel = [selected_voice]
61
- cond_paths = []
62
- for vsel in voice_sel:
63
- if vsel not in voices.keys():
64
- print(f'Error: voice {vsel} not available. Skipping.')
65
- continue
66
- cond_paths.extend(voices[vsel])
67
- if not cond_paths:
68
- print('Error: no valid voices specified. Try again.')
69
 
70
- conds = []
71
- for cond_path in cond_paths:
72
- c = load_audio(cond_path, 22050)
73
- conds.append(c)
74
  all_parts = []
75
  for j, text in enumerate(texts):
76
  if regenerate is not None and j not in regenerate:
77
  all_parts.append(load_audio(os.path.join(voice_outpath, f'{j}.wav'), 24000))
78
  continue
79
- gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
 
80
  gen = gen.squeeze(0).cpu()
81
  torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen, 24000)
82
  all_parts.append(gen)
 
5
  import torchaudio
6
 
7
  from api import TextToSpeech
8
+ from tortoise.utils.audio import load_audio, get_voices, load_voices
9
 
10
 
11
  def split_and_recombine_text(texts, desired_length=200, max_len=300):
 
40
  args = parser.parse_args()
41
 
42
  outpath = args.output_path
 
43
  selected_voices = args.voice.split(',')
44
  regenerate = args.regenerate
45
  if regenerate is not None:
 
57
  voice_sel = selected_voice.split('&')
58
  else:
59
  voice_sel = [selected_voice]
 
 
 
 
 
 
 
 
60
 
61
+ voice_samples, conditioning_latents = load_voices(voice_sel)
 
 
 
62
  all_parts = []
63
  for j, text in enumerate(texts):
64
  if regenerate is not None and j not in regenerate:
65
  all_parts.append(load_audio(os.path.join(voice_outpath, f'{j}.wav'), 24000))
66
  continue
67
+ gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
68
+ preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
69
  gen = gen.squeeze(0).cpu()
70
  torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen, 24000)
71
  all_parts.append(gen)
tortoise/utils/audio.py CHANGED
@@ -91,6 +91,37 @@ def get_voices():
91
  return voices
92
 
93
 
94
  class TacotronSTFT(torch.nn.Module):
95
  def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
96
  n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
 
91
  return voices
92
 
93
 
94
+ def load_voice(voice):
95
+ voices = get_voices()
96
+ paths = voices[voice]
97
+ if len(paths) == 1 and paths[0].endswith('.pth'):
98
+ return None, torch.load(paths[0])
99
+ else:
100
+ conds = []
101
+ for cond_path in paths:
102
+ c = load_audio(cond_path, 22050)
103
+ conds.append(c)
104
+ return conds, None
105
+
106
+
107
+ def load_voices(voices):
108
+ latents = []
109
+ clips = []
110
+ for voice in voices:
111
+ clip, latent = load_voice(voice)
112
+ if latent is None:
113
+ assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
114
+ clips.extend(clip)
115
+ elif clip is None:
116
+ assert len(clips) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
117
+ latents.append(latent)
118
+ if len(latents) == 0:
119
+ return clips, None
120
+ else:
121
+ # Each stored latent is an (autoregressive_latent, diffusion_latent) tuple; average each part separately.
+ auto_latents = torch.stack([l[0] for l in latents], dim=0).mean(dim=0)
+ diffusion_latents = torch.stack([l[1] for l in latents], dim=0).mean(dim=0)
122
+ return None, (auto_latents, diffusion_latents)
123
+
124
+
125
  class TacotronSTFT(torch.nn.Module):
126
  def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
127
  n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
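
A short usage sketch of the two helpers added above, as the reworked do_tts.py and read.py consume them. The voice names are examples; each helper returns a (voice_samples, conditioning_latents) pair with the unused half set to None.

```python
from api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices

tts = TextToSpeech()

# A single voice resolves to either raw reference clips or a stored latent .pth.
voice_samples, conditioning_latents = load_voice('pat')

# read.py's '&' syntax combines voices; they must be all raw clips or all latents, never a mix.
voice_samples, conditioning_latents = load_voices(['pat', 'deniro'])

gen = tts.tts_with_preset('Hello from a combined voice.', preset='fast',
                          voice_samples=voice_samples,
                          conditioning_latents=conditioning_latents)
```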