Commit f499d66 committed by jbetker
Parent(s): 2888ae0

misc fixes

tortoise/api.py CHANGED
@@ -37,6 +37,8 @@ def download_models(specific_models=None):
         'cvvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/cvvp.pth',
         'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/diffusion_decoder.pth',
         'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/vocoder.pth',
+        'rlg_auto.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/rlg_auto.pth',
+        'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/rlg_diffuser.pth',
     }
     os.makedirs('.models', exist_ok=True)
     def show_progress(block_num, block_size, total_size):
@@ -110,9 +112,9 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
     stop_token_indices = (codes == stop_token).nonzero()
     if len(stop_token_indices) == 0:
         if complain:
-            print("No stop tokens found. This typically means the spoken audio is too long. In some cases, the output "
-                  "will still be good, though. Listen to it and if it is missing words, try breaking up your input "
-                  "text.")
+            print("No stop tokens found in one of the generated voice clips. This typically means the spoken audio is "
+                  "too long. In some cases, the output will still be good, though. Listen to it and if it is missing words, "
+                  "try breaking up your input text.")
         return codes
     else:
         codes[stop_token_indices] = 83
@@ -163,8 +165,7 @@ class TextToSpeech:
     Main entry point into Tortoise.
     """

-    def __init__(self, autoregressive_batch_size=16, models_dir='.models', enable_redaction=True,
-                 save_random_voices=False):
+    def __init__(self, autoregressive_batch_size=16, models_dir='.models', enable_redaction=True):
         """
         Constructor
         :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@@ -174,14 +175,11 @@ class TextToSpeech:
         :param enable_redaction: When true, text enclosed in brackets are automatically redacted from the spoken output
                                  (but are still rendered by the model). This can be used for prompt engineering.
                                  Default is true.
-        :param save_random_voices: When true, voices that are randomly generated are saved to the `random_voices`
-                                   directory. Default is false.
         """
         self.autoregressive_batch_size = autoregressive_batch_size
         self.enable_redaction = enable_redaction
         if self.enable_redaction:
             self.aligner = Wav2VecAlignment()
-        self.save_random_voices = save_random_voices

         self.tokenizer = VoiceBpeTokenizer()
         download_models()
@@ -220,29 +218,6 @@ class TextToSpeech:
         self.rlg_auto = None
         self.rlg_diffusion = None

-    def tts_with_preset(self, text, preset='fast', **kwargs):
-        """
-        Calls TTS with one of a set of preset generation parameters. Options:
-            'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest).
-            'fast': Decent quality speech at a decent inference rate. A good choice for mass inference.
-            'standard': Very good quality. This is generally about as good as you are going to get.
-            'high_quality': Use if you want the absolute best. This is not really worth the compute, though.
-        """
-        # Use generally found best tuning knobs for generation.
-        kwargs.update({'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
-                       #'typical_sampling': True,
-                       'top_p': .8,
-                       'cond_free_k': 2.0, 'diffusion_temperature': 1.0})
-        # Presets are defined here.
-        presets = {
-            'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 32, 'cond_free': False},
-            'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
-            'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
-            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
-        }
-        kwargs.update(presets[preset])
-        return self.tts(text, **kwargs)
-
     def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
@@ -288,11 +263,30 @@ class TextToSpeech:
             self.rlg_diffusion = RandomLatentConverter(2048).eval()
             self.rlg_diffusion.load_state_dict(torch.load('.models/rlg_diffuser.pth', map_location=torch.device('cpu')))
         with torch.no_grad():
-            latents = self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
-            if self.save_random_voices:
-                os.makedirs('random_voices', exist_ok=True)
-                torch.save(latents, f'random_voices/{str(uuid.uuid4())}.pth')
-            return latents
+            return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
+
+    def tts_with_preset(self, text, preset='fast', **kwargs):
+        """
+        Calls TTS with one of a set of preset generation parameters. Options:
+            'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest).
+            'fast': Decent quality speech at a decent inference rate. A good choice for mass inference.
+            'standard': Very good quality. This is generally about as good as you are going to get.
+            'high_quality': Use if you want the absolute best. This is not really worth the compute, though.
+        """
+        # Use generally found best tuning knobs for generation.
+        kwargs.update({'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
+                       #'typical_sampling': True,
+                       'top_p': .8,
+                       'cond_free_k': 2.0, 'diffusion_temperature': 1.0})
+        # Presets are defined here.
+        presets = {
+            'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 32, 'cond_free': False},
+            'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
+            'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
+            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
+        }
+        kwargs.update(presets[preset])
+        return self.tts(text, **kwargs)

     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True,
             # autoregressive generation parameters follow
@@ -452,7 +446,7 @@ class TextToSpeech:

         def potentially_redact(clip, text):
             if self.enable_redaction:
-                return self.aligner.redact(clip, text)
+                return self.aligner.redact(clip.squeeze(1), text).unsqueeze(1)
             return clip
         wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]
         if len(wav_candidates) > 1:
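Taken together, the api.py changes remove the `save_random_voices` plumbing and relocate `tts_with_preset`. A minimal post-commit usage sketch (not part of the diff; it assumes the random-latent hunk above lives in a method named `get_random_conditioning_latents`, and that `tts` returns a (1, 1, N) tensor at 24kHz):

    import torch
    import torchaudio
    from tortoise.api import TextToSpeech

    tts = TextToSpeech()  # the `save_random_voices` kwarg no longer exists after this commit

    # Random voices are no longer auto-saved; callers persist the latents themselves.
    latents = tts.get_random_conditioning_latents()  # assumed method name for the hunk above
    torch.save(latents, 'my_random_voice.pth')

    # `tts_with_preset` behaves as before; it only moved within the class.
    wav = tts.tts_with_preset("I am a language model that has learned to speak.",
                              preset='fast', conditioning_latents=latents)
    torchaudio.save('random_voice.wav', wav.squeeze(0).cpu(), 24000)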
tortoise/do_tts.py CHANGED
@@ -8,7 +8,7 @@ from tortoise.utils.audio import load_audio, get_voices, load_voice

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
+    parser.add_argument('--text', type=str, help='Text to speak.', default="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast')
@@ -21,7 +21,7 @@ if __name__ == '__main__':
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)

-    tts = TextToSpeech(models_dir=args.model_dir, save_random_voices=True)
+    tts = TextToSpeech(models_dir=args.model_dir)

     selected_voices = args.voice.split(',')
     for k, voice in enumerate(selected_voices):
tortoise/is_this_from_tortoise.py CHANGED
@@ -5,7 +5,7 @@ from tortoise.utils.audio import load_audio

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--clip', type=str, help='Path to an audio clip to classify.', default="results/favorite_riding_hood.mp3")
+    parser.add_argument('--clip', type=str, help='Path to an audio clip to classify.', default="../examples/favorite_riding_hood.mp3")
     args = parser.parse_args()

     clip = load_audio(args.clip, 24000)
tortoise/read.py CHANGED
@@ -40,7 +40,7 @@ if __name__ == '__main__':
     parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
                                                       'should only be specified if you have custom checkpoints.', default='.models')
     args = parser.parse_args()
-    tts = TextToSpeech(models_dir=args.model_dir, save_random_voices=True)
+    tts = TextToSpeech(models_dir=args.model_dir)

     outpath = args.output_path
     selected_voices = args.voice.split(',')
tortoise/utils/audio.py CHANGED
@@ -114,7 +114,7 @@ def load_voices(voices):
         if voice == 'random':
             print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
             return None, None
-        latent, clip = load_voice(voice)
+        clip, latent = load_voice(voice)
        if latent is None:
            assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
            clips.extend(clip)
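The one-line swap above is a real bug fix: `load_voice` returns the audio clips first and the conditioning latents second, so the old unpacking crossed them. A short sketch under that assumption ('tom' is a hypothetical voice directory under voices/):

    from tortoise.utils.audio import load_voice

    # Correct order after this commit: audio samples first, latents second.
    clip, latent = load_voice('tom')
    if latent is None:
        print(f"Loaded {len(clip)} raw audio clips; conditioning latents will be computed later.")
    else:
        print("Loaded precomputed conditioning latents for this voice.")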
tortoise/utils/wav2vec_alignment.py CHANGED
@@ -1,3 +1,5 @@
+import re
+
 import torch
 import torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor
@@ -11,7 +13,7 @@ class Wav2VecAlignment:
         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"facebook/wav2vec2-large-960h")
         self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('jbetker/tacotron_symbols')

-    def align(self, audio, expected_text, audio_sample_rate=24000, topk=3):
+    def align(self, audio, expected_text, audio_sample_rate=24000, topk=3, return_partial=False):
         orig_len = audio.shape[-1]

         with torch.no_grad():
@@ -41,8 +43,10 @@ class Wav2VecAlignment:

         if len(expected_tokens) > 0:
             print(f"Alignment did not work. {len(expected_tokens)} were not found, with the following string un-aligned:"
-                  f" {self.tokenizer.decode(expected_tokens)}")
-            return None
+                  f" `{self.tokenizer.decode(expected_tokens)}`. Here's what wav2vec thought it heard:"
+                  f"`{self.tokenizer.decode(logits.argmax(-1).tolist())}`")
+            if not return_partial:
+                return None

         return alignments

@@ -54,6 +58,8 @@ class Wav2VecAlignment:
         for spl in splitted[1:]:
             assert ']' in spl, 'Every "[" character must be paired with a "]" with no nesting.'
             fully_split.extend(spl.split(']'))
+        # Remove any non-alphabetic character in the input text. This makes matching more likely.
+        fully_split = [re.sub(r'[^a-zA-Z ]', '', s) for s in fully_split]
         # At this point, fully_split is a list of strings, with every other string being something that should be redacted.
         non_redacted_intervals = []
         last_point = 0
@@ -63,20 +69,22 @@ class Wav2VecAlignment:
             last_point += len(fully_split[i])

         bare_text = ''.join(fully_split)
-        alignments = self.align(audio, bare_text, audio_sample_rate, topk)
-        if alignments is None:
-            return audio  # Cannot redact because alignment did not succeed.
+        alignments = self.align(audio, bare_text, audio_sample_rate, topk, return_partial=True)
+        # If alignment fails, we will attempt to recover by assuming the remaining alignments consume the rest of the string.
+        def get_alignment(i):
+            if i >= len(alignments):
+                return audio.shape[-1]

         output_audio = []
         for nri in non_redacted_intervals:
             start, stop = nri
-            output_audio.append(audio[:, alignments[start]:alignments[stop]])
+            output_audio.append(audio[:, get_alignment(start):get_alignment(stop)])
         return torch.cat(output_audio, dim=-1)


 if __name__ == '__main__':
-    some_audio = load_audio('../../results/favorites/morgan_freeman_metallic_hydrogen.mp3', 24000)
+    some_audio = load_audio('../../results/train_dotrice_0.wav', 24000)
     aligner = Wav2VecAlignment()
-    text = "instead of molten iron, jupiter [and brown dwaves] have hydrogen, which [is under so much pressure that it] develops metallic properties"
+    text = "[God fucking damn it I'm so angry] The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them."
     redact = aligner.redact(some_audio, text)
     torchaudio.save(f'test_output.wav', redact, 24000)
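For context, a sketch of the redaction flow this file implements, assuming a mono (1, num_samples) tensor at 24kHz and a placeholder clip path:

    import torchaudio
    from tortoise.utils.audio import load_audio
    from tortoise.utils.wav2vec_alignment import Wav2VecAlignment

    aligner = Wav2VecAlignment()
    audio = load_audio('some_clip.wav', 24000)  # placeholder input path

    # Text inside [brackets] is rendered by the model but cut from the waveform. With the
    # return_partial=True change above, a failed alignment now degrades gracefully instead
    # of skipping redaction entirely.
    redacted = aligner.redact(audio, "[this part is cut] but this part is kept")
    torchaudio.save('redacted.wav', redacted, 24000)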