Manmay committed on
Commit
b137e4d
1 Parent(s): 1370e35

fix memory leak

Browse files
Files changed (1) hide show
  1. tortoise/api.py +32 -51
tortoise/api.py CHANGED
@@ -243,28 +243,22 @@ class TextToSpeech:
243
  self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
244
  model_dim=1024,
245
  heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
246
- train_solo_embeddings=False).cpu().eval()
247
  self.autoregressive.load_state_dict(torch.load(get_model_path('autoregressive.pth', models_dir)), strict=False)
248
  self.autoregressive.post_init_gpt2_config(use_deepspeed=use_deepspeed, kv_cache=kv_cache, half=self.half)
249
 
250
  self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
251
  in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
252
- layer_drop=0, unconditioned_percentage=0).cpu().eval()
253
  self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
254
 
255
- self.vocoder = UnivNetGenerator().cpu()
256
  self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
257
  self.vocoder.eval(inference=True)
258
 
259
  # Random latent generators (RLGs) are loaded lazily.
260
  self.rlg_auto = None
261
  self.rlg_diffusion = None
262
- @contextmanager
263
- def temporary_cuda(self, model):
264
- m = model.to(self.device)
265
- yield m
266
- m = model.cpu()
267
-
268
  def get_conditioning_latents(self, voice_samples, return_mels=False):
269
  """
270
  Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
@@ -328,7 +322,6 @@ class TextToSpeech:
328
  # Presets are defined here.
329
  presets = {
330
  'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
331
- # 'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
332
  'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
333
  'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
334
  'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
@@ -409,57 +402,45 @@ class TextToSpeech:
409
  diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
410
 
411
  with torch.no_grad():
412
-
413
- stop_mel_token = self.autoregressive.stop_mel_token
414
  calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
415
  if verbose:
416
  print("Generating autoregressive samples..")
417
- with self.temporary_cuda(self.autoregressive
418
- ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
419
- codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
420
- do_sample=True,
421
- top_p=top_p,
422
- temperature=temperature,
423
- num_return_sequences=num_autoregressive_samples,
424
- length_penalty=length_penalty,
425
- repetition_penalty=repetition_penalty,
426
- max_generate_length=max_mel_tokens,
427
- **hf_generate_kwargs)
428
  # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
429
  # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
430
  # results, but will increase memory usage.
431
- with self.temporary_cuda(
432
- self.autoregressive
433
- ) as autoregressive, torch.autocast(
434
- device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
435
- ):
436
- best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
437
- torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
438
- torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
439
- return_latent=True, clip_inputs=False)
440
- del auto_conditioning
441
 
442
  if verbose:
443
  print("Transforming autoregressive outputs into audio..")
444
  wav_candidates = []
445
- with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
446
- self.vocoder
447
- ) as vocoder:
448
- latents = best_latents
449
- # Find the first occurrence of the "calm" token and trim the codes to that.
450
- ctokens = 0
451
- for k in range(codes.shape[-1]):
452
- if codes[0, k] == calm_token:
453
- ctokens += 1
454
- else:
455
- ctokens = 0
456
- if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
457
- latents = latents[:, :k]
458
- break
459
- mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
460
- verbose=verbose)
461
- wav = vocoder.inference(mel)
462
- wav_candidates.append(wav.cpu())
463
 
464
  def potentially_redact(clip, text):
465
  if self.enable_redaction:
 
243
  self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
244
  model_dim=1024,
245
  heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
246
+ train_solo_embeddings=False).cuda().eval()
247
  self.autoregressive.load_state_dict(torch.load(get_model_path('autoregressive.pth', models_dir)), strict=False)
248
  self.autoregressive.post_init_gpt2_config(use_deepspeed=use_deepspeed, kv_cache=kv_cache, half=self.half)
249
 
250
  self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
251
  in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
252
+ layer_drop=0, unconditioned_percentage=0).cuda().eval()
253
  self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
254
 
255
+ self.vocoder = UnivNetGenerator().cuda()
256
  self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
257
  self.vocoder.eval(inference=True)
258
 
259
  # Random latent generators (RLGs) are loaded lazily.
260
  self.rlg_auto = None
261
  self.rlg_diffusion = None
 
 
 
 
 
 
262
  def get_conditioning_latents(self, voice_samples, return_mels=False):
263
  """
264
  Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
 
322
  # Presets are defined here.
323
  presets = {
324
  'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
 
325
  'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
326
  'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
327
  'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
 
402
  diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
403
 
404
  with torch.no_grad():
 
 
405
  calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
406
  if verbose:
407
  print("Generating autoregressive samples..")
408
+ codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
409
+ do_sample=True,
410
+ top_p=top_p,
411
+ temperature=temperature,
412
+ num_return_sequences=num_autoregressive_samples,
413
+ length_penalty=length_penalty,
414
+ repetition_penalty=repetition_penalty,
415
+ max_generate_length=max_mel_tokens,
416
+ **hf_generate_kwargs)
 
 
417
  # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
418
  # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
419
  # results, but will increase memory usage.
420
+ best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
421
+ torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
422
+ torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
423
+ return_latent=True, clip_inputs=False)
424
+ del auto_conditioning
 
 
 
 
 
425
 
426
  if verbose:
427
  print("Transforming autoregressive outputs into audio..")
428
  wav_candidates = []
429
+ latents = best_latents
430
+ # Find the first occurrence of the "calm" token and trim the codes to that.
431
+ ctokens = 0
432
+ for k in range(codes.shape[-1]):
433
+ if codes[0, k] == calm_token:
434
+ ctokens += 1
435
+ else:
436
+ ctokens = 0
437
+ if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
438
+ latents = latents[:, :k]
439
+ break
440
+ mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
441
+ verbose=verbose)
442
+ wav = self.vocoder.inference(mel)
443
+ wav_candidates.append(wav.cpu())
 
 
 
444
 
445
  def potentially_redact(clip, text):
446
  if self.enable_redaction: