Manmay committed
Commit 1370e35
1 Parent(s): 30f76f3

Remove CLVP for lower GPU usage and increased speed.

Files changed (2):
  1. app.py +3 -15
  2. tortoise/api.py +42 -180
app.py CHANGED
@@ -40,7 +40,6 @@ VOICE_OPTIONS = [
     "william",
     "jane_eyre",
     "random",  # special option for random voice
-    "disabled",  # special option for disabled voice
 ]
 
 
@@ -49,7 +48,6 @@ def inference(
     script,
     voice,
     voice_b,
-    preset,
     seed,
     split_by_newline,
 ):
@@ -81,7 +79,7 @@ def inference(
             text,
             voice_samples=voice_samples,
             conditioning_latents=conditioning_latents,
-            preset=preset,
+            preset="ultra_fast",
             k=1,
             use_deterministic_seed=seed,
         )
@@ -91,12 +89,9 @@ def inference(
 
     full_audio = torch.cat(all_parts, dim=-1)
 
-    # os.makedirs("outputs", exist_ok=True)
-    # torchaudio.save(os.path.join("outputs", f"{name}.wav"), full_audio, 24000)
-
     with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
         f.write(
-            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
+            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
         )
 
     output_texts = [f"({j+1}) {texts[j]}" for j in range(len(texts))]
@@ -120,14 +115,8 @@ def main():
         )
         script = gr.File(label="Upload a text file")
 
-        preset = gr.Radio(
-            ["ultra_fast", "fast", "standard", "high_quality"],
-            value="fast",
-            label="Preset mode (determines quality with tradeoff over speed):",
-            type="value",
-        )
         voice = gr.Dropdown(
-            VOICE_OPTIONS, value="angie", label="Select voice:", type="value"
+            VOICE_OPTIONS, value="jane_eyre", label="Select voice:", type="value"
         )
         voice_b = gr.Dropdown(
             VOICE_OPTIONS,
@@ -154,7 +143,6 @@ def main():
                 script,
                 voice,
                 voice_b,
-                preset,
                 seed,
                 split_by_newline,
             ],
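With the preset radio removed from the Gradio UI, every request now takes the hardcoded "ultra_fast" path. A minimal sketch of the resulting call, assuming the upstream Tortoise interface (TextToSpeech.tts_with_preset and tortoise.utils.audio.load_voice); the input text, output path, and seed below are illustrative, not from the diff:

import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()
voice_samples, conditioning_latents = load_voice("jane_eyre")  # new default voice
gen = tts.tts_with_preset(
    "Hello from Tortoise.",                     # illustrative input text
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset="ultra_fast",                        # hardcoded by this commit
    k=1,
    use_deterministic_seed=0,
)
# gen is a (1, 1, N) waveform at 24 kHz; drop the batch dim before saving.
torchaudio.save("out.wav", gen.squeeze(0).cpu(), 24000)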
tortoise/api.py CHANGED
@@ -252,13 +252,6 @@ class TextToSpeech:
                                           layer_drop=0, unconditioned_percentage=0).cpu().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
 
-        self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
-                         text_seq_len=350, text_heads=12,
-                         num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430,
-                         use_xformers=True).cpu().eval()
-        self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir)))
-        self.cvvp = None  # CVVP model is only loaded if used.
-
         self.vocoder = UnivNetGenerator().cpu()
         self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
         self.vocoder.eval(inference=True)
@@ -272,13 +265,6 @@ class TextToSpeech:
             yield m
             m = model.cpu()
 
-
-    def load_cvvp(self):
-        """Load CVVP model."""
-        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
-                         speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
-        self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
-
     def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
@@ -341,8 +327,9 @@ class TextToSpeech:
                            'cond_free_k': 2.0, 'diffusion_temperature': 1.0}
         # Presets are defined here.
         presets = {
-            'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
-            'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
+            'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
+            # 'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
+            'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
             'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
         }
@@ -422,182 +409,57 @@ class TextToSpeech:
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
         with torch.no_grad():
-            samples = []
-            num_batches = num_autoregressive_samples // self.autoregressive_batch_size
+
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
             if verbose:
                 print("Generating autoregressive samples..")
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.autoregressive
-                ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
-                    for b in tqdm(range(num_batches), disable=not verbose):
-                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                                do_sample=True,
-                                                                top_p=top_p,
-                                                                temperature=temperature,
-                                                                num_return_sequences=self.autoregressive_batch_size,
-                                                                length_penalty=length_penalty,
-                                                                repetition_penalty=repetition_penalty,
-                                                                max_generate_length=max_mel_tokens,
-                                                                **hf_generate_kwargs)
-                        padding_needed = max_mel_tokens - codes.shape[1]
-                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-                        samples.append(codes)
-            else:
-                with self.temporary_cuda(self.autoregressive) as autoregressive:
-                    for b in tqdm(range(num_batches), disable=not verbose):
-                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                                do_sample=True,
-                                                                top_p=top_p,
-                                                                temperature=temperature,
-                                                                num_return_sequences=self.autoregressive_batch_size,
-                                                                length_penalty=length_penalty,
-                                                                repetition_penalty=repetition_penalty,
-                                                                max_generate_length=max_mel_tokens,
-                                                                **hf_generate_kwargs)
-                        padding_needed = max_mel_tokens - codes.shape[1]
-                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-                        samples.append(codes)
-
-            clip_results = []
-
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
-                ):
-                    if cvvp_amount > 0:
-                        if self.cvvp is None:
-                            self.load_cvvp()
-                        self.cvvp = self.cvvp.to(self.device)
-                    if verbose:
-                        if self.cvvp is None:
-                            print("Computing best candidates using CLVP")
-                        else:
-                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-                    for batch in tqdm(samples, disable=not verbose):
-                        for i in range(batch.shape[0]):
-                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                        if cvvp_amount != 1:
-                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                        if auto_conds is not None and cvvp_amount > 0:
-                            cvvp_accumulator = 0
-                            for cl in range(auto_conds.shape[1]):
-                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                            cvvp = cvvp_accumulator / auto_conds.shape[1]
-                            if cvvp_amount == 1:
-                                clip_results.append(cvvp)
-                            else:
-                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
-                        else:
-                            clip_results.append(clvp_out)
-                    clip_results = torch.cat(clip_results, dim=0)
-                    samples = torch.cat(samples, dim=0)
-                    best_results = samples[torch.topk(clip_results, k=k).indices]
-            else:
-                with self.temporary_cuda(self.clvp) as clvp:
-                    if cvvp_amount > 0:
-                        if self.cvvp is None:
-                            self.load_cvvp()
-                        self.cvvp = self.cvvp.to(self.device)
-                    if verbose:
-                        if self.cvvp is None:
-                            print("Computing best candidates using CLVP")
-                        else:
-                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-                    for batch in tqdm(samples, disable=not verbose):
-                        for i in range(batch.shape[0]):
-                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                        if cvvp_amount != 1:
-                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                        if auto_conds is not None and cvvp_amount > 0:
-                            cvvp_accumulator = 0
-                            for cl in range(auto_conds.shape[1]):
-                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                            cvvp = cvvp_accumulator / auto_conds.shape[1]
-                            if cvvp_amount == 1:
-                                clip_results.append(cvvp)
-                            else:
-                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
-                        else:
-                            clip_results.append(clvp_out)
-                    clip_results = torch.cat(clip_results, dim=0)
-                    samples = torch.cat(samples, dim=0)
-                    best_results = samples[torch.topk(clip_results, k=k).indices]
-            if self.cvvp is not None:
-                self.cvvp = self.cvvp.cpu()
-            del samples
-
+            with self.temporary_cuda(self.autoregressive
+            ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
+                codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
+                                                        do_sample=True,
+                                                        top_p=top_p,
+                                                        temperature=temperature,
+                                                        num_return_sequences=num_autoregressive_samples,
+                                                        length_penalty=length_penalty,
+                                                        repetition_penalty=repetition_penalty,
+                                                        max_generate_length=max_mel_tokens,
+                                                        **hf_generate_kwargs)
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
-                ):
-                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                                  torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                                  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                                  return_latent=True, clip_inputs=False)
-                    del auto_conditioning
-            else:
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive:
-                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                                  torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                                  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                                  return_latent=True, clip_inputs=False)
-                    del auto_conditioning
+            with self.temporary_cuda(
+                self.autoregressive
+            ) as autoregressive, torch.autocast(
+                device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+            ):
+                best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
+                                              torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
+                                              torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                              return_latent=True, clip_inputs=False)
+                del auto_conditioning
 
             if verbose:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
-                    self.vocoder
-                ) as vocoder:
-                    for b in range(best_results.shape[0]):
-                        codes = best_results[b].unsqueeze(0)
-                        latents = best_latents[b].unsqueeze(0)
-
-                        # Find the first occurrence of the "calm" token and trim the codes to that.
-                        ctokens = 0
-                        for k in range(codes.shape[-1]):
-                            if codes[0, k] == calm_token:
-                                ctokens += 1
-                            else:
-                                ctokens = 0
-                            if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
-                                latents = latents[:, :k]
-                                break
-                        mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
-                                                       verbose=verbose)
-                        wav = vocoder.inference(mel)
-                        wav_candidates.append(wav.cpu())
-            else:
-                diffusion, vocoder = self.diffusion, self.vocoder
-                diffusion_conditioning = diffusion_conditioning.cpu()
-                for b in range(best_results.shape[0]):
-                    codes = best_results[b].unsqueeze(0).cpu()
-                    latents = best_latents[b].unsqueeze(0).cpu()
-
-                    # Find the first occurrence of the "calm" token and trim the codes to that.
-                    ctokens = 0
-                    for k in range(codes.shape[-1]):
-                        if codes[0, k] == calm_token:
-                            ctokens += 1
-                        else:
-                            ctokens = 0
-                        if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
-                            latents = latents[:, :k]
-                            break
-                    mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
-                                                   verbose=verbose)
-                    wav = vocoder.inference(mel)
-                    wav_candidates.append(wav.cpu())
+            with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
+                self.vocoder
+            ) as vocoder:
+                latents = best_latents
+                # Find the first occurrence of the "calm" token and trim the codes to that.
+                ctokens = 0
+                for k in range(codes.shape[-1]):
+                    if codes[0, k] == calm_token:
+                        ctokens += 1
+                    else:
+                        ctokens = 0
+                    if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
+                        latents = latents[:, :k]
+                        break
+                mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
+                                               verbose=verbose)
+                wav = vocoder.inference(mel)
+                wav_candidates.append(wav.cpu())
 
         def potentially_redact(clip, text):
             if self.enable_redaction:
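The net effect inside tts(): instead of generating num_autoregressive_samples in batches and re-ranking them with CLVP (optionally blended with CVVP), a single inference_speech call now produces the candidates, and their hidden-state latents go straight to the diffusion decoder. The presets are cut down to match: 'ultra_fast' drops from 16 samples / 30 iterations to 1 / 15, and 'fast' from 96 / 80 to 32 / 50. The calm-token trim is the only post-processing that survives; here is a self-contained sketch of that loop with stand-in tensors (the shapes and the quiet region below are illustrative, not taken from the model):

import torch

calm_token = 83                              # token that codes silence (see api.py)
codes = torch.randint(0, 83, (1, 500))       # stand-in for the autoregressive mel codes
codes[0, 300:320] = calm_token               # pretend speech goes quiet at position 300
latents = torch.randn(1, 500, 1024)          # stand-in for best_latents

# Scan for the first run of more than 8 consecutive calm tokens and cut the
# latents there, so the diffusion model has room to terminate speech cleanly.
ctokens = 0
for k in range(codes.shape[-1]):
    if codes[0, k] == calm_token:
        ctokens += 1
    else:
        ctokens = 0
    if ctokens > 8:
        latents = latents[:, :k]
        break

print(latents.shape)  # torch.Size([1, 308, 1024]) with the stand-ins above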