jbetker's picture
Add support for extracting and feeding conditioning latents directly into the model
0ffc191
raw
history blame
1.75 kB
import argparse
import os
import torchaudio
from api import TextToSpeech
from tortoise.utils.audio import load_audio, get_voices, load_voice
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat')
parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility',
default=.5)
parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
args = parser.parse_args()
os.makedirs(args.output_path, exist_ok=True)
tts = TextToSpeech()
selected_voices = args.voice.split(',')
for voice in selected_voices:
voice_samples, conditioning_latents = load_voice(voice)
gen = tts.tts_with_preset(args.text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)