jbetker committed
Commit 84d641c
1 Parent(s): da31baa
Files changed (2)
  1. api.py +14 -6
  2. models/autoregressive.py +1 -1
api.py CHANGED
@@ -6,6 +6,7 @@ from urllib import request
 import torch
 import torch.nn.functional as F
 import progressbar
+import torchaudio
 
 from models.cvvp import CVVP
 from models.diffusion_decoder import DiffusionTts
@@ -118,29 +119,36 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
     return codes
 
 
-def do_spectrogram_diffusion(diffusion_model, diffuser, mel_codes, conditioning_samples, temperature=1):
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_samples, temperature=1):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
     """
     with torch.no_grad():
         cond_mels = []
         for sample in conditioning_samples:
+            # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
+            sample = torchaudio.functional.resample(sample, 22050, 24000)
             sample = pad_or_truncate(sample, 102400)
-            cond_mel = wav_to_univnet_mel(sample.to(mel_codes.device), do_normalization=False)
+            cond_mel = wav_to_univnet_mel(sample.to(latents.device), do_normalization=False)
             cond_mels.append(cond_mel)
         cond_mels = torch.stack(cond_mels, dim=1)
 
-        output_seq_len = mel_codes.shape[1]*4*24000//22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
-        output_shape = (mel_codes.shape[0], 100, output_seq_len)
-        precomputed_embeddings = diffusion_model.timestep_independent(mel_codes, cond_mels, output_seq_len, False)
+        output_seq_len = latents.shape[1] * 4 * 24000 // 22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+        output_shape = (latents.shape[0], 100, output_seq_len)
+        precomputed_embeddings = diffusion_model.timestep_independent(latents, cond_mels, output_seq_len, False)
 
-        noise = torch.randn(output_shape, device=mel_codes.device) * temperature
+        noise = torch.randn(output_shape, device=latents.device) * temperature
         mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
                                      model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings})
         return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
 
 
 class TextToSpeech:
+    """
+    Main entry point into Tortoise.
+    :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
+                                      GPU OOM errors. Larger numbers generate slightly faster.
+    """
     def __init__(self, autoregressive_batch_size=16):
         self.autoregressive_batch_size = autoregressive_batch_size
         self.tokenizer = VoiceBpeTokenizer()
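The substantive change in api.py is that conditioning audio, which arrives at 22.05 kHz, is now resampled to the diffuser's 24 kHz rate before being converted to a conditioning mel; the renamed `latents` argument is what gets handed to `timestep_independent` in place of discrete codes. A minimal sketch of the rate conversion and the matching output-length arithmetic, isolated from the rest of the pipeline (the dummy waveform and the latent frame count of 250 are illustrative, not taken from the repo):

import torch
import torchaudio

# One second of dummy audio at 22.05kHz, standing in for a conditioning clip.
sample = torch.randn(1, 22050)
sample = torchaudio.functional.resample(sample, 22050, 24000)
print(sample.shape)  # torch.Size([1, 24000]) -- one second at 24kHz

# The output-length arithmetic in do_spectrogram_diffusion mirrors the same
# rate conversion: each latent frame expands to four spectrogram frames, and
# the frame count is then rescaled from the 22.05kHz grid to the 24kHz grid.
latent_frames = 250  # illustrative
output_seq_len = latent_frames * 4 * 24000 // 22050
print(output_seq_len)  # 1088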
models/autoregressive.py CHANGED
@@ -356,7 +356,7 @@ class UnifiedVoice(nn.Module):
         preformatting to create a working TTS model.
         """
         # Set padding areas within MEL (currently it is coded with the MEL code for <zero>).
-        mel_lengths = wav_lengths // self.mel_length_compression
+        mel_lengths = torch.div(wav_lengths, self.mel_length_compression, rounding_mode='trunc')
         for b in range(len(mel_lengths)):
             actual_end = mel_lengths[b] + 1 # Due to the convolutional nature of how these tokens are generated, it would be best if the model predicts a token past the actual last token.
             if actual_end < mel_input_tokens.shape[-1]:
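The one-line change in models/autoregressive.py replaces the `//` operator with an explicit `torch.div(..., rounding_mode='trunc')`. On PyTorch releases current around the time of this commit, floor division on tensors emitted a deprecation warning because the historical operator truncated toward zero rather than flooring; naming the rounding mode states the intent explicitly and, for non-negative lengths like these, leaves the result unchanged. A small sketch with illustrative values (`mel_length_compression = 1024` is assumed here, not read from the model config):

import torch

wav_lengths = torch.tensor([22050, 44100, 48001])
mel_length_compression = 1024  # assumed for illustration

# Old form: tensor floor division warned about deprecation on PyTorch ~1.8-1.12.
old = wav_lengths // mel_length_compression
# New form: identical result for non-negative inputs, rounding made explicit.
new = torch.div(wav_lengths, mel_length_compression, rounding_mode='trunc')

assert torch.equal(old, new)
print(new)  # tensor([21, 43, 46])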