jbetker committed
Commit: 56f8385
Parent: 3214ca0

Update sweep & eval_multiple with new voices

Files changed (4):
  1. api.py (+10, -3)
  2. eval_multiple.py (+23, -20)
  3. read.py (+3, -14)
  4. sweep.py (+14, -10)
api.py CHANGED
@@ -140,6 +140,13 @@ class TextToSpeech:
                                                        average_conditioning_embeddings=True).cpu().eval()
         self.autoregressive.load_state_dict(torch.load('.models/autoregressive_diverse.pth'))
 
+        self.autoregressive_for_latents = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
+                                                       model_dim=1024,
+                                                       heads=16, number_text_tokens=256, start_text_token=255, checkpointing=False,
+                                                       train_solo_embeddings=False,
+                                                       average_conditioning_embeddings=True).cpu().eval()
+        self.autoregressive_for_latents.load_state_dict(torch.load('.models/autoregressive_diverse.pth'))
+
         self.clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
                               text_seq_len=350, text_heads=8,
                               num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
@@ -221,11 +228,11 @@ class TextToSpeech:
         # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
-        self.autoregressive = self.autoregressive.cuda()
-        best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
+        self.autoregressive_for_latents = self.autoregressive_for_latents.cuda()
+        best_latents = self.autoregressive_for_latents(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
                                                        torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
                                                        return_latent=True, clip_inputs=False)
-        self.autoregressive = self.autoregressive.cpu()
+        self.autoregressive_for_latents = self.autoregressive_for_latents.cpu()
 
         print("Performing vocoding..")
         wav_candidates = []
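
The substance of this api.py change: instead of moving self.autoregressive itself on and off the GPU to re-extract latents, a second copy of the same checkpoint is kept as self.autoregressive_for_latents and shuttled to CUDA only for that single forward pass. A minimal sketch of the offload pattern in isolation (the helper name and generic signature are illustrative, not from the repo):

import torch
import torch.nn as nn

def forward_on_gpu(model: nn.Module, *inputs: torch.Tensor) -> torch.Tensor:
    # Move the module to the GPU for a single forward pass, then park it
    # back on the CPU so VRAM stays free for the next pipeline stage.
    model.cuda()
    try:
        with torch.no_grad():
            out = model(*[t.cuda() for t in inputs])
    finally:
        model.cpu()
    return out

Note that nn.Module.cuda() mutates the module in place and returns self, so the reassignment style used in the diff and a bare call behave the same.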
eval_multiple.py CHANGED
@@ -6,32 +6,35 @@ from api import TextToSpeech
 from utils.audio import load_audio
 
 if __name__ == '__main__':
-    fname = 'Y:\\libritts\\test-clean\\transcribed-brief-w2v.tsv'
-    outpath = 'D:\\tmp\\tortoise-tts-eval\\diverse_new_decoder_1'
+    fname = 'Y:\\clips\\books2\\subset512-oco.tsv'
+    stop_after = 128
+    outpath_base = 'D:\\tmp\\tortoise-tts-eval\\diverse'
     outpath_real = 'D:\\tmp\\tortoise-tts-eval\\real'
 
-    os.makedirs(outpath, exist_ok=True)
     os.makedirs(outpath_real, exist_ok=True)
     with open(fname, 'r', encoding='utf-8') as f:
         lines = [l.strip().split('\t') for l in f.readlines()]
 
-    recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8')
     tts = TextToSpeech()
-    for e, line in enumerate(lines):
-        transcript = line[0]
-        if len(transcript) > 120:
-            continue # We need to support this, but cannot yet.
-        path = os.path.join(os.path.dirname(fname), line[1])
-        cond_audio = load_audio(path, 22050)
-        torchaudio.save(os.path.join(outpath_real, os.path.basename(line[1])), cond_audio, 22050)
-        sample = tts.tts(transcript, [cond_audio, cond_audio], num_autoregressive_samples=256, k=1,
-                         repetition_penalty=2.0, length_penalty=2, temperature=.5, top_p=.5,
-                         diffusion_temperature=.7, cond_free_k=2, diffusion_iterations=100)
+    for k in range(4):
+        outpath = f'{outpath_base}_{k}'
+        os.makedirs(outpath, exist_ok=True)
+        recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8')
+        for e, line in enumerate(lines):
+            if e >= stop_after:
+                break
+            transcript = line[0]
+            path = os.path.join(os.path.dirname(fname), line[1])
+            cond_audio = load_audio(path, 22050)
+            torchaudio.save(os.path.join(outpath_real, os.path.basename(line[1])), cond_audio, 22050)
+            sample = tts.tts(transcript, [cond_audio, cond_audio], num_autoregressive_samples=128, k=1,
+                             repetition_penalty=2.0, length_penalty=2, temperature=.5, top_p=.5,
+                             diffusion_temperature=.7, cond_free_k=2, diffusion_iterations=70)
 
-        down = torchaudio.functional.resample(sample, 24000, 22050)
-        fout_path = os.path.join(outpath, os.path.basename(line[1]))
-        torchaudio.save(fout_path, down.squeeze(0), 22050)
+            down = torchaudio.functional.resample(sample, 24000, 22050)
+            fout_path = os.path.join(outpath, os.path.basename(line[1]))
+            torchaudio.save(fout_path, down.squeeze(0), 22050)
 
-        recorder.write(f'{transcript}\t{fout_path}\n')
-        recorder.flush()
-    recorder.close()
+            recorder.write(f'{transcript}\t{fout_path}\n')
+            recorder.flush()
+        recorder.close()
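
eval_multiple.py now renders the same capped subset (stop_after = 128) four times, into diverse_0 through diverse_3, and each run writes a transcript.tsv manifest pairing the input text with the generated wav. A sketch of reading one of those manifests back, assuming only the two-column tab-separated layout written by the loop above:

# Hypothetical consumer: list the clips recorded in one run's manifest.
with open('D:\\tmp\\tortoise-tts-eval\\diverse_0\\transcript.tsv', encoding='utf-8') as f:
    for row in f:
        transcript, wav_path = row.rstrip('\n').split('\t')
        print(wav_path, '<-', transcript[:60])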
read.py CHANGED
@@ -30,24 +30,13 @@ if __name__ == '__main__':
     # These are voices drawn randomly from the training set. You are free to substitute your own voices in, but testing
     # has shown that the model does not generalize to new voices very well.
     preselected_cond_voices = {
-        # Male voices
-        'dotrice': ['voices/dotrice/1.wav', 'voices/dotrice/2.wav'],
-        'harris': ['voices/harris/1.wav', 'voices/harris/2.wav'],
-        'lescault': ['voices/lescault/1.wav', 'voices/lescault/2.wav'],
-        'otto': ['voices/otto/1.wav', 'voices/otto/2.wav'],
-        'obama': ['voices/obama/1.wav', 'voices/obama/2.wav'],
-        'carlin': ['voices/carlin/1.wav', 'voices/carlin/2.wav'],
-        # Female voices
-        'atkins': ['voices/atkins/1.wav', 'voices/atkins/2.wav'],
-        'grace': ['voices/grace/1.wav', 'voices/grace/2.wav'],
-        'kennard': ['voices/kennard/1.wav', 'voices/kennard/2.wav'],
-        'mol': ['voices/mol/1.wav', 'voices/mol/2.wav'],
-        'lj': ['voices/lj/1.wav', 'voices/lj/2.wav'],
+        'emma_stone': ['voices/emma_stone/1.wav','voices/emma_stone/2.wav','voices/emma_stone/3.wav'],
+        'tom_hanks': ['voices/tom_hanks/1.wav','voices/tom_hanks/2.wav','voices/tom_hanks/3.wav'],
     }
 
     parser = argparse.ArgumentParser()
     parser.add_argument('-textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
-    parser.add_argument('-voice', type=str, help='Use a preset conditioning voice (defined above). Overrides cond_path.', default='dotrice')
+    parser.add_argument('-voice', type=str, help='Use a preset conditioning voice (defined above). Overrides cond_path.', default='emma_stone')
     parser.add_argument('-num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=512)
     parser.add_argument('-batch_size', type=int, help='How many samples to process at once in the autoregressive model.', default=16)
     parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='results/longform/')
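
The preset table shrinks to two voices, each now backed by three conditioning clips rather than two. A sketch of how an entry is consumed, assuming the load_audio helper read.py imports from utils.audio (the surrounding variable names are illustrative):

from utils.audio import load_audio

# Each preset maps a voice name to a list of conditioning clips; all of
# them are loaded at 22.05 kHz and passed to TextToSpeech.tts() together.
cond_paths = preselected_cond_voices['tom_hanks']
conds = [load_audio(path, 22050) for path in cond_paths]

Since -voice simply selects a key in this dict, the corresponding invocation would be python read.py -voice tom_hanks.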
sweep.py CHANGED
@@ -24,19 +24,24 @@ def permutations(args):
 
 
 if __name__ == '__main__':
-    fname = 'Y:\\libritts\\test-clean\\transcribed-brief-w2v.tsv'
-    outpath_base = 'D:\\tmp\\tortoise-tts-eval\\std_sweep3'
+    fname = 'Y:\\clips\\books2\\subset512-oco.tsv'
+    stop_after = 128
+    outpath_base = 'D:\\tmp\\tortoise-tts-eval\\sweep'
     outpath_real = 'D:\\tmp\\tortoise-tts-eval\\real'
 
     arg_ranges = {
-        'top_p': [.3,.4,.5,.6],
-        'temperature': [.5, .6],
+        'top_p': [.5, 1],
+        'temperature': [.5, 1],
+        'diffusion_temperature': [.6, 1],
+        'cond_free_k': [0, 1, 4],
+        'repetition_penalty': [1.0, 2.0]
     }
     cfgs = permutations(arg_ranges)
     shuffle(cfgs)
 
     for cfg in cfgs:
-        outpath = os.path.join(outpath_base, f'{cfg["top_p"]}_{cfg["temperature"]}')
+        cfg_desc = '_'.join([f'{k}-{v}' for k,v in cfg.items()])
+        outpath = os.path.join(outpath_base, f'{cfg_desc}')
         os.makedirs(outpath, exist_ok=True)
         os.makedirs(outpath_real, exist_ok=True)
         with open(fname, 'r', encoding='utf-8') as f:
@@ -45,15 +50,14 @@ if __name__ == '__main__':
         recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8')
         tts = TextToSpeech()
         for e, line in enumerate(lines):
+            if e >= stop_after:
+                break
             transcript = line[0]
-            if len(transcript) > 120:
-                continue # We need to support this, but cannot yet.
             path = os.path.join(os.path.dirname(fname), line[1])
             cond_audio = load_audio(path, 22050)
             torchaudio.save(os.path.join(outpath_real, os.path.basename(line[1])), cond_audio, 22050)
-            sample = tts.tts(transcript, [cond_audio, cond_audio], num_autoregressive_samples=256, k=1, diffusion_iterations=200,
-                             repetition_penalty=2.0, length_penalty=2, temperature=.5, top_p=.5,
-                             diffusion_temperature=.7, cond_free_k=2, **cfg)
+            sample = tts.tts(transcript, [cond_audio, cond_audio], num_autoregressive_samples=256,
+                             k=1, diffusion_iterations=70, length_penalty=1.0, **cfg)
             down = torchaudio.functional.resample(sample, 24000, 22050)
             fout_path = os.path.join(outpath, os.path.basename(line[1]))
             torchaudio.save(fout_path, down.squeeze(0), 22050)
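
The hunk header references a permutations(args) helper defined earlier in sweep.py but not shown in this diff. A minimal sketch of the Cartesian-product expansion it presumably performs, mapping a dict of value lists to a list of keyword-argument dicts:

from itertools import product

def permutations(args):
    # {'top_p': [.5, 1], 'temperature': [.5, 1]} ->
    # [{'top_p': .5, 'temperature': .5}, {'top_p': .5, 'temperature': 1}, ...]
    keys = list(args)
    return [dict(zip(keys, combo)) for combo in product(*(args[k] for k in keys))]

With the widened grid above this yields 2 * 2 * 2 * 3 * 2 = 48 configurations; shuffling them up front means a sweep interrupted partway through still samples the space roughly uniformly.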