jbetker committed
Commit 39ec1b0
1 Parent(s): 7d5bb89

Support totally random voices (and make fixes to previous changes)

.gitignore CHANGED
@@ -129,6 +129,7 @@ dmypy.json
 .pyre/
 
 .idea/*
-.models/*
+tortoise/.models/*
+tortoise/random_voices/*
 .custom/*
 results/*
tortoise/api.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import random
+import uuid
 from urllib import request
 
 import torch
@@ -15,6 +16,7 @@ from tqdm import tqdm
 
 from tortoise.models.arch_util import TorchMelSpectrogram
 from tortoise.models.clvp import CLVP
+from tortoise.models.random_latent_generator import RandomLatentConverter
 from tortoise.models.vocoder import UnivNetGenerator
 from tortoise.utils.audio import wav_to_univnet_mel, denormalize_tacotron_mel
 from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
@@ -161,7 +163,8 @@ class TextToSpeech:
     Main entry point into Tortoise.
     """
 
-    def __init__(self, autoregressive_batch_size=16, models_dir='.models', enable_redaction=True):
+    def __init__(self, autoregressive_batch_size=16, models_dir='.models', enable_redaction=True,
+                 save_random_voices=False):
         """
         Constructor
         :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@@ -170,11 +173,15 @@ class TextToSpeech:
                models, otherwise use the defaults.
         :param enable_redaction: When true, text enclosed in brackets are automatically redacted from the spoken output
                (but are still rendered by the model). This can be used for prompt engineering.
+               Default is true.
+        :param save_random_voices: When true, voices that are randomly generated are saved to the `random_voices`
+               directory. Default is false.
         """
         self.autoregressive_batch_size = autoregressive_batch_size
         self.enable_redaction = enable_redaction
         if self.enable_redaction:
             self.aligner = Wav2VecAlignment()
+        self.save_random_voices = save_random_voices
 
         self.tokenizer = VoiceBpeTokenizer()
         download_models()
@@ -210,6 +217,10 @@ class TextToSpeech:
         self.vocoder.load_state_dict(torch.load(f'{models_dir}/vocoder.pth')['model_g'])
         self.vocoder.eval(inference=True)
 
+        # Random latent generators (RLGs) are loaded lazily.
+        self.rlg_auto = None
+        self.rlg_diffusion = None
+
    def tts_with_preset(self, text, preset='fast', **kwargs):
         """
         Calls TTS with one of a set of preset generation parameters. Options:
@@ -265,7 +276,21 @@ class TextToSpeech:
         diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
         self.diffusion = self.diffusion.cpu()
 
-        return auto_latent, diffusion_latent
+        return auto_latent, diffusion_latent, auto_conds
+
+    def get_random_conditioning_latents(self):
+        # Lazy-load the RLG models.
+        if self.rlg_auto is None:
+            self.rlg_auto = RandomLatentConverter(1024).eval()
+            self.rlg_auto.load_state_dict(torch.load('.models/rlg_auto.pth', map_location=torch.device('cpu')))
+            self.rlg_diffusion = RandomLatentConverter(2048).eval()
+            self.rlg_diffusion.load_state_dict(torch.load('.models/rlg_diffuser.pth', map_location=torch.device('cpu')))
+        with torch.no_grad():
+            latents = self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
+            if self.save_random_voices:
+                os.makedirs('random_voices', exist_ok=True)
+                torch.save(latents, f'random_voices/{str(uuid.uuid4())}.pth')
+            return latents
 
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True,
             # autoregressive generation parameters follow
@@ -323,14 +348,19 @@ class TextToSpeech:
         :return: Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length.
                  Sample rate is 24kHz.
         """
-        text = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).cuda()
-        text = F.pad(text, (0, 1))  # This may not be necessary.
-        assert text.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
+        text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).cuda()
+        text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
+        assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
 
+        auto_conds = None
         if voice_samples is not None:
-            auto_conditioning, diffusion_conditioning = self.get_conditioning_latents(voice_samples)
-        else:
+            auto_conditioning, diffusion_conditioning, auto_conds = self.get_conditioning_latents(voice_samples)
+        elif conditioning_latents is not None:
             auto_conditioning, diffusion_conditioning = conditioning_latents
+        else:
+            auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
+        auto_conditioning = auto_conditioning.cuda()
+        diffusion_conditioning = diffusion_conditioning.cuda()
 
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
@@ -343,7 +373,7 @@ class TextToSpeech:
             if verbose:
                 print("Generating autoregressive samples..")
             for b in tqdm(range(num_batches), disable=not verbose):
-                codes = self.autoregressive.inference_speech(auto_conditioning, text,
+                codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                              do_sample=True,
                                                              top_p=top_p,
                                                              temperature=temperature,
@@ -365,12 +395,15 @@ class TextToSpeech:
             for batch in tqdm(samples, disable=not verbose):
                 for i in range(batch.shape[0]):
                     batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                clvp = self.clvp(text.repeat(batch.shape[0], 1), batch, return_loss=False)
-                cvvp_accumulator = 0
-                for cl in range(conds.shape[1]):
-                    cvvp_accumulator = cvvp_accumulator + self.cvvp(conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                cvvp = cvvp_accumulator / conds.shape[1]
-                clip_results.append(clvp * clvp_cvvp_slider + cvvp * (1-clvp_cvvp_slider))
+                clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
+                if auto_conds is not None:
+                    cvvp_accumulator = 0
+                    for cl in range(auto_conds.shape[1]):
+                        cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
+                    cvvp = cvvp_accumulator / auto_conds.shape[1]
+                    clip_results.append(clvp * clvp_cvvp_slider + cvvp * (1-clvp_cvvp_slider))
+                else:
+                    clip_results.append(clvp)
             clip_results = torch.cat(clip_results, dim=0)
             samples = torch.cat(samples, dim=0)
             best_results = samples[torch.topk(clip_results, k=k).indices]
@@ -382,8 +415,8 @@ class TextToSpeech:
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
         self.autoregressive = self.autoregressive.cuda()
-        best_latents = self.autoregressive(auto_conditioning, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
-                                           torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
+        best_latents = self.autoregressive(auto_conditioning, text_tokens, torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
+                                           torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
                                            return_latent=True, clip_inputs=False)
         self.autoregressive = self.autoregressive.cpu()
         del auto_conditioning
@@ -415,7 +448,7 @@ class TextToSpeech:
         self.diffusion = self.diffusion.cpu()
         self.vocoder = self.vocoder.cpu()
 
-        def potentially_redact(self, clip, text):
+        def potentially_redact(clip, text):
             if self.enable_redaction:
                 return self.aligner.redact(clip, text)
             return clip
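
Taken together, these changes mean that calling tts() with neither voice_samples nor conditioning_latents now falls through to get_random_conditioning_latents(). A minimal sketch of that path, assuming the rlg_auto.pth and rlg_diffuser.pth checkpoints are present in .models (everything else below comes from the diff):

# Sketch: synthesize with a freshly sampled random voice.
import torchaudio
from tortoise.api import TextToSpeech

tts = TextToSpeech(save_random_voices=True)  # also persists each sampled voice to random_voices/
gen = tts.tts_with_preset('Random voices come straight from the latent space.',
                          voice_samples=None, conditioning_latents=None, preset='fast')
torchaudio.save('random_voice.wav', gen.squeeze(0).cpu(), 24000)

Note that the random path skips CVVP re-ranking entirely: auto_conds stays None, so candidate clips are scored by CLVP alone.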
tortoise/do_tts.py CHANGED
@@ -10,23 +10,23 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
-                                                  'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat')
-    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+                                                  'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast')
     parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
                         help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility',
                         default=.5)
-    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
+    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/')
     parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
                                                       'should only be specified if you have custom checkpoints.', default='.models')
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)
 
-    tts = TextToSpeech(models_dir=args.model_dir)
+    tts = TextToSpeech(models_dir=args.model_dir, save_random_voices=True)
 
     selected_voices = args.voice.split(',')
-    for voice in selected_voices:
+    for k, voice in enumerate(selected_voices):
         voice_samples, conditioning_latents = load_voice(voice)
         gen = tts.tts_with_preset(args.text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                                   preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
-        torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
+        torchaudio.save(os.path.join(args.output_path, f'{voice}_{k}.wav'), gen.squeeze(0).cpu(), 24000)
 
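With these defaults, a bare invocation now produces a random voice with the 'fast' preset. Hypothetical example runs (output filenames follow the new {voice}_{k}.wav pattern):

python tortoise/do_tts.py --text "Hello from a brand new voice."   # --voice defaults to 'random'
python tortoise/do_tts.py --voice random,random --text "Hello."    # two distinct voices: random_0.wav, random_1.wav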
tortoise/models/autoregressive.py CHANGED
@@ -401,13 +401,13 @@ class UnifiedVoice(nn.Module):
         conds = conds.mean(dim=1).unsqueeze(1)
         return conds
 
-    def forward(self, speech_conditioning_input, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
+    def forward(self, speech_conditioning_latent, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
                 return_latent=False, clip_inputs=True):
         """
         Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode
         (actuated by `text_first`).
 
-        speech_conditioning_input: MEL float tensor, (b,80,s)
+        speech_conditioning_latent: float tensor, (b,1024)
         text_inputs: long tensor, (b,t)
         text_lengths: long tensor, (b,)
         mel_inputs: long tensor, (b,m)
@@ -421,7 +421,7 @@ class UnifiedVoice(nn.Module):
         # Types are expressed by expanding the text embedding space.
         if types is not None:
             text_inputs = text_inputs * (1+types).unsqueeze(-1)
-        
+
         if clip_inputs:
             # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by
             # chopping the inputs by the maximum actual length.
@@ -435,7 +435,7 @@ class UnifiedVoice(nn.Module):
         text_inputs = F.pad(text_inputs, (0,1), value=self.stop_text_token)
         mel_codes = F.pad(mel_codes, (0,1), value=self.stop_mel_token)
 
-        conds = self.get_conditioning(speech_conditioning_input)
+        conds = speech_conditioning_latent.unsqueeze(1)
         text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
         text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
         mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token)
@@ -540,7 +540,7 @@ class UnifiedVoice(nn.Module):
         text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
         text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
 
-        conds = speech_conditioning_latent
+        conds = speech_conditioning_latent.unsqueeze(1)
         emb = torch.cat([conds, text_emb], dim=1)
         self.inference_model.store_mel_emb(emb)
 
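The upshot is that UnifiedVoice.forward() and inference_speech() now accept a precomputed conditioning latent rather than raw conditioning MELs, with get_conditioning() hoisted out to the caller (see the api.py diff above). A rough sketch of the new calling convention; the names are illustrative and the shapes are hedged, since this code is mid-refactor:

# Sketch: derive the voice latent once, then reuse it for both
# training-style forward passes and sampling.
cond_latent = model.get_conditioning(cond_mels)   # pooled voice latent
losses = model(cond_latent, text_inputs, text_lengths, mel_codes, wav_lengths)
codes = model.inference_speech(cond_latent, text_tokens, do_sample=True)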
tortoise/models/diffusion_decoder.py CHANGED
@@ -226,6 +226,7 @@ class DiffusionTts(nn.Module):
         for j in range(speech_conditioning_input.shape[1]):
             conds.append(self.contextual_embedder(speech_conditioning_input[:, j]))
         conds = torch.cat(conds, dim=-1)
+        conds = conds.mean(dim=-1)
         return conds
 
     def timestep_independent(self, aligned_conditioning, conditioning_latent, expected_seq_len, return_code_pred):
@@ -233,9 +234,7 @@ class DiffusionTts(nn.Module):
         if is_latent(aligned_conditioning):
             aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
 
-        conds = conditioning_latent
-        cond_emb = conds.mean(dim=-1)
-        cond_scale, cond_shift = torch.chunk(cond_emb, 2, dim=1)
+        cond_scale, cond_shift = torch.chunk(conditioning_latent, 2, dim=1)
         if is_latent(aligned_conditioning):
             code_emb = self.latent_conditioner(aligned_conditioning)
         else:
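
Mean-pooling now happens once inside get_conditioning(), so timestep_independent() receives an already-pooled vector and only splits it into scale/shift halves. This also explains the RandomLatentConverter(2048) in api.py: chunking a 2048-dim latent on dim=1 yields the two 1024-dim conditioning vectors. A sketch (shapes assumed, not taken from the diff):

# Sketch: pooled conditioning latent -> scale/shift modulation vectors.
cond_latent = diffusion.get_conditioning(cond_mels)          # (b, 2048) after pooling
cond_scale, cond_shift = torch.chunk(cond_latent, 2, dim=1)  # two (b, 1024) halves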
tortoise/models/random_latent_generator.py ADDED
@@ -0,0 +1,55 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5):
+    if bias is not None:
+        rest_dim = [1] * (input.ndim - bias.ndim - 1)
+        return (
+            F.leaky_relu(
+                input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=negative_slope
+            )
+            * scale
+        )
+    else:
+        return F.leaky_relu(input, negative_slope=0.2) * scale
+
+
+class EqualLinear(nn.Module):
+    def __init__(
+        self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1
+    ):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
+        if bias:
+            self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
+        else:
+            self.bias = None
+        self.scale = (1 / math.sqrt(in_dim)) * lr_mul
+        self.lr_mul = lr_mul
+
+    def forward(self, input):
+        out = F.linear(input, self.weight * self.scale)
+        out = fused_leaky_relu(out, self.bias * self.lr_mul)
+        return out
+
+
+class RandomLatentConverter(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.layers = nn.Sequential(*[EqualLinear(channels, channels, lr_mul=.1) for _ in range(5)],
+                                    nn.Linear(channels, channels))
+        self.channels = channels
+
+    def forward(self, ref):
+        r = torch.randn(ref.shape[0], self.channels, device=ref.device)
+        y = self.layers(r)
+        return y
+
+
+if __name__ == '__main__':
+    model = RandomLatentConverter(512)
+    model(torch.randn(5,512))
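
EqualLinear here resembles the equalized-learning-rate linear layers used in StyleGAN's mapping network, so RandomLatentConverter is essentially a small mapping network from Gaussian noise to the conditioning-latent space. A usage sketch mirroring how api.py loads it (the ref argument only supplies batch size and device):

# Sketch: sample one random autoregressive-conditioning latent on CPU.
rlg = RandomLatentConverter(1024).eval()
rlg.load_state_dict(torch.load('.models/rlg_auto.pth', map_location=torch.device('cpu')))
with torch.no_grad():
    latent = rlg(torch.tensor([0.0]))  # ref of shape (1,) -> latent of shape (1, 1024)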
tortoise/read.py CHANGED
@@ -31,7 +31,7 @@ if __name__ == '__main__':
     parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat')
-    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
+    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/longform/')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
     parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None)
     parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
@@ -40,7 +40,7 @@ if __name__ == '__main__':
     parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
                                                       'should only be specified if you have custom checkpoints.', default='.models')
     args = parser.parse_args()
-    tts = TextToSpeech(models_dir=args.model_dir)
+    tts = TextToSpeech(models_dir=args.model_dir, save_random_voices=True)
 
     outpath = args.output_path
     selected_voices = args.voice.split(',')
tortoise/utils/audio.py CHANGED
@@ -92,6 +92,9 @@ def get_voices():
 
 
 def load_voice(voice):
+    if voice == 'random':
+        return None, None
+
     voices = get_voices()
     paths = voices[voice]
     if len(paths) == 1 and paths[0].endswith('.pth'):
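
The 'random' sentinel lets callers reuse the ordinary voice-loading path: load_voice('random') yields (None, None), which is exactly what tts() now interprets as "sample a new voice". For instance (tts and text assumed to be in scope, as in do_tts.py):

# Sketch: 'random' flows through the normal voice-loading path.
voice_samples, conditioning_latents = load_voice('random')  # -> (None, None)
gen = tts.tts_with_preset(text, voice_samples=voice_samples,
                          conditioning_latents=conditioning_latents, preset='fast')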