jbetker committed on
Commit
ad0f3fd
1 Parent(s): a578697

update to v2 models (clvp pending)

Browse files
Files changed (1) hide show
  1. api.py +10 -21
api.py CHANGED
@@ -23,9 +23,11 @@ from utils.tokenizer import VoiceBpeTokenizer, lev_distance
23
  pbar = None
24
  def download_models():
25
  MODELS = {
26
- 'clip.pth': 'https://huggingface.co/jbetker/tortoise-tts-clip/resolve/main/pytorch-model.bin',
27
- 'diffusion.pth': 'https://huggingface.co/jbetker/tortoise-tts-diffusion-v1/resolve/main/pytorch-model.bin',
28
- 'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-autoregressive/resolve/main/pytorch-model.bin'
 
 
29
  }
30
  os.makedirs('.models', exist_ok=True)
31
  def show_progress(block_num, block_size, total_size):
@@ -162,25 +164,12 @@ class TextToSpeech:
162
  train_solo_embeddings=False,
163
  average_conditioning_embeddings=True).cpu().eval()
164
  self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
165
- '''
166
- self.autoregressive = UnifiedVoice(max_mel_tokens=2048, max_text_tokens=1024, max_conditioning_inputs=1, layers=42,
167
- model_dim=1152, heads=18, number_text_tokens=256, train_solo_embeddings=False,
168
- average_conditioning_embeddings=True, types=2).cpu().eval()
169
- self.autoregressive.load_state_dict(torch.load('X:\\dlas\\experiments\\train_gpt_tts_xl\\models\\15250_gpt_ema.pth'))
170
- '''
171
-
172
- self.autoregressive_for_diffusion = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
173
- model_dim=1024,
174
- heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
175
- train_solo_embeddings=False,
176
- average_conditioning_embeddings=True).cpu().eval()
177
- self.autoregressive_for_diffusion.load_state_dict(torch.load('.models/autoregressive.pth'))
178
 
179
  self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
180
  text_seq_len=350, text_heads=8,
181
  num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
182
  use_xformers=True).cpu().eval()
183
- self.clvp.load_state_dict(torch.load('.models/clip.pth'))
184
 
185
  self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
186
  speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
@@ -281,11 +270,11 @@ class TextToSpeech:
281
  # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
282
  # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
283
  # results, but will increase memory usage.
284
- self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cuda()
285
- best_latents = self.autoregressive_for_diffusion(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
286
- torch.tensor([best_results.shape[-1]*self.autoregressive_for_diffusion.mel_length_compression], device=conds.device),
287
  return_latent=True, clip_inputs=False)
288
- self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cpu()
289
 
290
  print("Performing vocoding..")
291
  wav_candidates = []
 
23
  pbar = None
24
  def download_models():
25
  MODELS = {
26
+ 'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth',
27
+ 'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clip.pth',
28
+ 'clip.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/cvvp.pth',
29
+ 'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth',
30
+ 'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
31
  }
32
  os.makedirs('.models', exist_ok=True)
33
  def show_progress(block_num, block_size, total_size):
 
164
  train_solo_embeddings=False,
165
  average_conditioning_embeddings=True).cpu().eval()
166
  self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
169
  text_seq_len=350, text_heads=8,
170
  num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
171
  use_xformers=True).cpu().eval()
172
+ self.clvp.load_state_dict(torch.load('.models/clvp.pth'))
173
 
174
  self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
175
  speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
 
270
  # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
271
  # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
272
  # results, but will increase memory usage.
273
+ self.autoregressive = self.autoregressive.cuda()
274
+ best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
275
+ torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
276
  return_latent=True, clip_inputs=False)
277
+ self.autoregressive = self.autoregressive.cpu()
278
 
279
  print("Performing vocoding..")
280
  wav_candidates = []