# Text to Speech Playground

In [2]:
import os

import torch
import gradio as gr
from TTS.api import TTS
# Set COQUI_TOS_AGREED for this process before any model is loaded.
# NOTE(review): presumably this lets Coqui skip its interactive terms-of-service
# prompt when the XTTS model is downloaded — confirm against the TTS docs.
os.environ["COQUI_TOS_AGREED"] = "1"
# Disabled: only relevant when experimenting with the Apple "mps" device below.
# os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

 from .autonotebook import tqdm as notebook_tqdm


In [3]:
from collections import namedtuple

# One cloneable speaker: a display name plus paths to reference recordings.
# `neutral` and `angry` are audio file paths (or None when no recording exists).
# The typename matches the bound name so repr() and pickling work as expected;
# `angry` defaults to None because most speakers only have a neutral sample.
Voice = namedtuple('Voice', ['name', 'neutral', 'angry'], defaults=(None,))


In [84]:
# Available speakers as (name, neutral sample path, angry sample path or None).
# The paths are later handed to compute_speaker_embedding as reference audio.
voices = [
    Voice(*fields)
    for fields in (
        ('Attenborough', 'audio/attenborough/neutral.wav', None),
        ('Rick', 'audio/rick/neutral.wav', None),
        ('Freeman', 'audio/freeman/neutral.wav', 'audio/freeman/angry.wav'),
        ('Walken', 'audio/walken/neutral.wav', None),
        ('Darth Wader', 'audio/darth/neutral.wav', None),
    )
]

In [5]:
# Display the configured voices.
# NOTE(review): the saved output of this cell lists .mp3 paths while the cell
# defining `voices` uses .wav — the output is stale; re-run after a kernel restart.
voices

[voice(name='Attenborough', neutral='audio/attenborough/neutral.mp3', angry=None),
 voice(name='Rick', neutral='audio/rick/neutral.mp3', angry=None),
 voice(name='Freeman', neutral='audio/freeman/neutral.mp3', angry='audio/freeman/angry.mp3'),
 voice(name='Walken', neutral='audio/walken/neutral.mp3', angry=None),
 voice(name='Darth Wader', neutral='audio/darth/neutral.mp3', angry=None)]

In [6]:
# Load the XTTS v2 text-to-speech model onto the GPU when CUDA is available,
# otherwise onto the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# device = "mps"
tts_pipelins = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.


 _torch_pytree._register_pytree_node(
 _torch_pytree._register_pytree_node(
 _torch_pytree._register_pytree_node(


 > Using model: xtts


In [7]:
import IPython


In [81]:
# Maps a reference-audio path to the conditioning latents computed for it
# (filled by compute_speaker_embedding). Re-running this cell empties the cache.
speaker_embedding_cache = dict()

In [82]:
def compute_speaker_embedding(voice_path: str, config, pipeline, cache):
    """Return the conditioning latents for ``voice_path``, computing them at most once.

    Args:
        voice_path: Path of the reference recording; also used as the cache key.
        config: Object exposing ``gpt_cond_len``, ``gpt_cond_chunk_len``,
            ``max_ref_len`` and ``sound_norm_refs``.
        pipeline: Object whose ``synthesizer.tts_model`` provides
            ``get_conditioning_latents``.
        cache: Mutable mapping from path to previously computed latents; a
            missing entry is added in place.

    Returns:
        Whatever ``get_conditioning_latents`` returned for this path (callers in
        this notebook unpack it as ``(gpt_cond_latent, speaker_embedding)``).

    Note:
        The cache is keyed by path only, so a later change to ``config`` does
        not invalidate an existing entry.
    """
    if voice_path in cache:
        return cache[voice_path]

    xtts_model = pipeline.synthesizer.tts_model
    latents = xtts_model.get_conditioning_latents(
        audio_path=voice_path,
        gpt_cond_len=config.gpt_cond_len,
        gpt_cond_chunk_len=config.gpt_cond_chunk_len,
        max_ref_length=config.max_ref_len,
        sound_norm_refs=config.sound_norm_refs,
    )
    cache[voice_path] = latents
    return latents

In [87]:
# Smoke test: compute (and cache) the latents for the first voice's neutral sample.
# NOTE(review): this binds the helper's return value to `out`, a name other cells
# use for synthesized audio samples — the meaning of `out` depends on run order.
out = compute_speaker_embedding(voices[0].neutral, tts_pipelins.synthesizer.tts_config, tts_pipelins, speaker_embedding_cache)

In [8]:
# Baseline: synthesize through the stock `tts_pipelins.tts` API, passing the
# reference recording directly as `speaker_wav` instead of pre-computed latents.
# NOTE(review): the sentences printed in this cell's saved output do not match
# the text below, so that output comes from an earlier run.
out = tts_pipelins.tts(
 "Hello, I am Rick, pickle rick, you took a wrong turn and now you're stuck in a parallel universe",
 speaker_wav="audio/freeman/neutral.wav",
 language="en",
 # file_path="out.wav",
)

 > Text splitted to sentences.
['Hey Petra, so you are hungry?', 'and you like me to prepare some strawberries for you?', 'do you like strawberries?']
 > Processing time: 15.77448582649231
 > Real-time factor: 1.7459813091024587


In [13]:
from typing import List
import time

In [19]:
# Reference recording used when computing conditioning latents by hand; it is
# the same file as the Freeman voice's neutral sample in `voices`.
ref_audio_path = "audio/freeman/neutral.wav"

In [53]:
# Bind `config` here so this cell does not depend on a later cell having run
# first (it previously raised NameError on Restart & Run All).
# `config` is the model's own tts_config object, so this assignment mutates it
# in place and affects every later call that reads `max_ref_len` from it.
config = tts_pipelins.synthesizer.tts_config
config.max_ref_len = 360

In [78]:
# Compute the conditioning latents for `ref_audio_path` directly on the model,
# bypassing `speaker_embedding_cache`, using the settings currently in `config`.
config = tts_pipelins.synthesizer.tts_config
gpt_cond_latent, speaker_embedding = tts_pipelins.synthesizer.tts_model.get_conditioning_latents(
    sound_norm_refs=config.sound_norm_refs,
    max_ref_length=config.max_ref_len,
    gpt_cond_chunk_len=config.gpt_cond_chunk_len,
    gpt_cond_len=config.gpt_cond_len,
    audio_path=ref_audio_path,
)

In [107]:
# Same latents via the caching helper, this time for the first voice's neutral
# sample; overwrites any `gpt_cond_latent` / `speaker_embedding` set earlier.
(gpt_cond_latent, speaker_embedding) = compute_speaker_embedding(voices[0].neutral, tts_pipelins.synthesizer.tts_config, tts_pipelins, speaker_embedding_cache)

In [114]:
import numpy as np

In [116]:
# Show only the shape of the synthesized audio; displaying the array itself
# would dump every sample into the notebook.
np.array(out).shape

(205872,)

In [110]:
# Number of elements in `out` (the audio samples when `out` holds synthesized audio).
len(out)

205872

In [128]:
# Synthesize with the notebook's own `tts` function, reusing the pre-computed
# latents instead of re-reading the reference recording.
# NOTE(review): `tts` is defined in a cell further down, so this cell fails on a
# fresh top-to-bottom run until that definition has been executed.
# `speaker` and `speed` are not named parameters of `tts`; they travel through
# its **kwargs into `tts_model.inference`.
out = tts(
 tts_pipelins.synthesizer,
 "Something is up!",
 # speaker_wav=ref_audio_path,
 language_name="en",
 speaker=None,
 gpt_cond_latent=gpt_cond_latent,
 speaker_embedding=speaker_embedding,
 speed=1.1,
 # file_path="out.wav",
)

 > Text splitted to sentences.
['Something is up!']
 > Processing time: 2.9515581130981445
 > Real-time factor: 1.588292083019672


In [129]:
# Play back at the rate the synthesizer actually produces instead of a
# hard-coded 22050 Hz; a mismatched rate plays the clip at the wrong speed/pitch.
IPython.display.Audio(out, rate=tts_pipelins.synthesizer.output_sample_rate)

In [66]:
from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

def tts(
    self,
    text: str = "",
    language_name: str = "",
    reference_wav=None,
    gpt_cond_latent=None,
    speaker_embedding=None,
    split_sentences: bool = True,
    **kwargs,
) -> List[float]:
    """Synthesize ``text`` sentence by sentence from pre-computed conditioning latents.

    Written as an unbound method: call it as ``tts(synthesizer, ...)`` where
    ``synthesizer`` provides ``split_into_sentences``, ``tts_model.inference``
    and ``tts_config.audio``.

    Args:
        self: The synthesizer object described above.
        text (str): Input text. Required unless ``reference_wav`` is given.
        language_name (str, optional): Language id forwarded to the model. Defaults to "".
        reference_wav (optional): Reference waveform for voice conversion. Voice
            conversion is NOT implemented here: when it is set, no audio is
            generated and an empty list is returned. Defaults to None.
        gpt_cond_latent (optional): Conditioning latent forwarded to
            ``tts_model.inference``. Defaults to None.
        speaker_embedding (optional): Speaker embedding forwarded to
            ``tts_model.inference``. Defaults to None.
        split_sentences (bool, optional): Split ``text`` into sentences and
            synthesize each one separately. Defaults to True.
        **kwargs: Extra keyword arguments for ``tts_model.inference``. They take
            precedence over the sampling defaults below (temperature,
            length_penalty, repetition_penalty, top_k, top_p, do_sample).

    Returns:
        List[float]: Audio samples of all sentences concatenated, each sentence
        followed by 10000 zero samples of silence.

    Raises:
        ValueError: If neither ``text`` nor ``reference_wav`` is provided.
    """
    start_time = time.time()
    wavs = []

    if not text and not reference_wav:
        raise ValueError(
            "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
        )

    sens = []
    if text:
        sens = [text]
        if split_sentences:
            print(" > Text splitted to sentences.")
            sens = self.split_into_sentences(text)
        print(sens)

    # Sampling defaults; merging (rather than passing them next to **kwargs)
    # lets a caller override any of them without a duplicate-keyword TypeError.
    inference_kwargs = {
        "temperature": 0.75,
        "length_penalty": 1.0,
        "repetition_penalty": 10.0,
        "top_k": 50,
        "top_p": 0.85,
        "do_sample": True,
    }
    inference_kwargs.update(kwargs)

    silence_gap = [0] * 10000  # pause appended after every sentence

    if not reference_wav:  # not voice conversion
        for sen in sens:
            outputs = self.tts_model.inference(
                sen,
                language_name,
                gpt_cond_latent,
                speaker_embedding,
                **inference_kwargs,
            )
            waveform = outputs["wav"]
            # The model may hand back a torch tensor (possibly on the GPU) or an
            # array; normalize to a CPU numpy array either way.
            if torch.is_tensor(waveform):
                waveform = waveform.detach().cpu().numpy()
            waveform = waveform.squeeze()

            # trim silence
            audio_config = self.tts_config.audio
            if "do_trim_silence" in audio_config and audio_config["do_trim_silence"]:
                # Imported lazily: only needed when the model config asks for trimming.
                from TTS.tts.utils.synthesis import trim_silence

                waveform = trim_silence(waveform, self.tts_model.ap)

            # reshape(-1) keeps a one-sample (0-d after squeeze) result iterable.
            wavs += waveform.reshape(-1).tolist()
            wavs += silence_gap

    # compute stats
    process_time = time.time() - start_time
    # NOTE(review): prefers the synthesizer's output_sample_rate when it exists,
    # since the decoded audio rate can differ from audio["sample_rate"] — confirm
    # for the loaded model.
    sample_rate = getattr(self, "output_sample_rate", None) or self.tts_config.audio["sample_rate"]
    audio_time = len(wavs) / sample_rate
    print(f" > Processing time: {process_time}")
    if audio_time > 0:  # nothing was generated -> no meaningful real-time factor
        print(f" > Real-time factor: {process_time / audio_time}")
    return wavs

In [None]:
# Debugging aid: show the class of the loaded pipeline object.
type(tts_pipelins)

In [None]:
# Play back at the rate the synthesizer actually produces instead of a
# hard-coded 22050 Hz; a mismatched rate plays the clip at the wrong speed/pitch.
IPython.display.Audio(out, rate=tts_pipelins.synthesizer.output_sample_rate)

In [None]:
def text_to_speech(voice, tts):
    """Return the neutral reference sample of ``voice``.

    Placeholder implementation: ``tts`` is accepted but never used, and no
    speech is synthesized.
    """
    neutral_sample = voice.neutral
    return neutral_sample

In [None]:
 # NOTE(review): orphaned fragment — an indented function body with a `return`
 # but no enclosing `def`, so this cell cannot run as written. It also relies on
 # names not defined anywhere visible in this notebook (`quest_processing`,
 # `state`, and a string-like `voice`), and calls `tts.tts_to_file`, whereas
 # `tts` in this notebook is a plain function. Looks pasted from another
 # project — restore the missing function header or delete the cell.
 tts.tts_to_file(text= str(quest_processing[0]),
 file_path="output.wav",
 speaker_wav=f'Audio_Files/{voice}.wav',
 language=quest_processing[3],
 emotion = "angry")

 audio_path = "output.wav"
 return audio_path, state['context'], state

In [90]:
# Build the radio-button labels: one "<name> - Neutral" / "<name> - Angry" entry
# for every reference sample that is actually configured.
voice_options = []
for voice in voices:
    voice_options.extend(
        label
        for label, sample in (
            (f"{voice.name} - Neutral", voice.neutral),
            (f"{voice.name} - Angry", voice.angry),
        )
        if sample
    )

In [101]:
def voice_from_text(voice, available_voices=None):
    """Map a UI label such as ``"Rick - Neutral"`` back to its reference-audio path.

    Args:
        voice: Label in the form ``"<name> - Neutral"`` or ``"<name> - Angry"``.
        available_voices: Iterable of objects with ``name``, ``neutral`` and
            ``angry`` attributes. Defaults to the notebook-level ``voices`` list.

    Returns:
        The path stored for the matching voice and style.

    Raises:
        ValueError: If no voice matches the label, or the matching voice has no
            recording for that style. Previously this returned ``None``, which
            only failed later inside the model call.
    """
    if available_voices is None:
        available_voices = voices  # notebook-level default

    for v in available_voices:
        if voice == f"{v.name} - Neutral":
            sample_path = v.neutral
        elif voice == f"{v.name} - Angry":
            sample_path = v.angry
        else:
            continue
        if not sample_path:
            raise ValueError(f"Voice {voice!r} has no reference recording configured")
        return sample_path

    raise ValueError(f"Unknown voice option: {voice!r}")

In [121]:
def tts_gradio(text, voice, state):
    """Gradio callback: synthesize ``text`` in the selected voice.

    Args:
        text: Text to speak.
        voice: Label chosen in the UI, e.g. ``"Rick - Neutral"``.
        state: Value of the Gradio state component from the previous call
            (``None`` on the first call); only logged here.

    Returns:
        A pair ``((sample_rate, samples), new_state)`` where ``samples`` is a
        numpy array of audio samples and ``new_state`` records the text and
        voice of this request.

    Raises:
        ValueError: If ``voice`` does not resolve to a reference recording.
    """
    print(text, voice, state)
    voice_path = voice_from_text(voice)
    if voice_path is None:
        # Fail here with a clear message rather than deep inside the model call.
        raise ValueError(f"No reference audio configured for voice {voice!r}")

    synthesizer = tts_pipelins.synthesizer
    (gpt_cond_latent, speaker_embedding) = compute_speaker_embedding(
        voice_path, synthesizer.tts_config, tts_pipelins, speaker_embedding_cache
    )
    out = tts(
        synthesizer,
        text,
        language_name="en",
        speaker=None,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        speed=1.1,
        # file_path="out.wav",
    )
    # Report the rate the synthesizer actually decodes at; fall back to the
    # previous hard-coded value if the attribute is unavailable.
    sample_rate = getattr(synthesizer, "output_sample_rate", 22050)
    return (sample_rate, np.array(out)), dict(text=text, voice=voice)

In [122]:
# Inspect which reference recordings currently have cached latents.
speaker_embedding_cache.keys()

dict_keys(['audio/attenborough/neutral.wav'])

In [127]:
# TEXT-TO-AUDIO INTERFACE: a textbox and a voice selector in, synthesized audio out.

# To use the microphone in Chrome, go to chrome://flags/#unsafely-treat-insecure-origin-as-secure,
# enter http://10.186.115.21:7860/ under "Insecure origins treated as secure", enable it and relaunch Chrome.
# NOTE(review): this interface has no microphone input, so the note above looks left over from another notebook.


# NOTE(review): `model_answer` is not read anywhere in the visible notebook.
model_answer= ''
general_context= "This is going to be fun, let's enjoy ourselves"
# Define the initial state with some initial context.
# `initial_state` is only used to derive the textbox's starting text; the
# gr.State() components below are created without an initial value.
print(general_context)
initial_state = {'context': general_context}
initial_context= initial_state['context']
# Create the Gradio interface.
iface = gr.Interface(
 fn=tts_gradio,
 inputs=[
 gr.Textbox(value=initial_context, visible=True, label='Enter the text to be converted to speech', placeholder="This is going to be fun, let's enjoy ourselves", lines=5),
 gr.Radio(choices=voice_options, label='Choose a voice', value=voice_options[0], show_label=True), # Radio button for voice selection
 gr.State() # This will keep track of the context state across interactions.
 ],
 outputs=[
 gr.Audio(label = 'output audio', autoplay=True),
 gr.State()
 ],
 flagging_options=['👎', '👍'],
)
# Close all open interfaces to make the port available.
gr.close_all()
# Launch the interface on a fixed port.
# NOTE(review): server_name="0.0.0.0" presumably exposes the app to other
# machines on the network, and debug=True appears to keep this cell running
# until interrupted (the saved output ends with a keyboard interruption) — confirm.
iface.launch(debug=True, share=False, server_name="0.0.0.0", server_port=7860, ssl_verify=False)

This is going to be fun, let's enjoy ourselves
Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Running on local URL: http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.


This is going to be fun, let's enjoy ourselves Darth Wader - Neutral None
 > Text splitted to sentences.
["This is going to be fun, let's enjoy ourselves"]
 > Processing time: 9.152068138122559
 > Real-time factor: 1.8119083325456329




This is going to be fun, let's enjoy ourselves Darth Wader - Neutral {'text': "This is going to be fun, let's enjoy ourselves", 'voice': 'Darth Wader - Neutral'}
 > Text splitted to sentences.
["This is going to be fun, let's enjoy ourselves"]
 > Processing time: 7.824646234512329
 > Real-time factor: 1.8261372721316347
Keyboard interruption in main thread... closing server.


