|
!pip install -U scipy |
|
!git clone https://github.com/neonbjb/tortoise-tts.git |
|
%cd tortoise-tts |
|
!pip install -r requirements.txt |
|
!python setup.py install |
|
!pip install gradio |
|
|
|
import os |
|
import gradio as gr |
|
import torchaudio |
|
import time |
|
from datetime import datetime |
|
from tortoise.api import TextToSpeech |
|
from tortoise.utils.audio import load_audio, load_voice, load_voices |
|
import os |
|
|
|
|
|
# Nothing in this file reads COMMANDLINE_ARGS; presumably it is consumed by
# the hosting/launcher environment — verify before removing.
os.environ.update({"COMMANDLINE_ARGS": "--no-gradio-queue"})

# Pseudo-voices appended to the on-disk voice list in every voice dropdown:
#   "random"       - passed through to the voice loader as a voice name
#   "custom_voice" - use the clip supplied through the audio input
#   "disabled"     - sentinel that switches an optional extra voice off
VOICE_OPTIONS = ["random", "custom_voice", "disabled"]
|
|
|
def inference(text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed):
    """Synthesize three speech candidates for ``text`` with the chosen voice(s).

    Args:
        text: Text to speak.
        emotion: Emotion choice. Any value other than "None/Custom" is
            prepended to the text as "[I am really <emotion>,]".
        prompt: Free-form prompt, prepended as "[<prompt>,]" only when
            ``emotion`` is "None/Custom" and the prompt is not blank.
        voice: Primary voice name, or "custom_voice" to use ``mic_audio``.
        mic_audio: File path of the user-supplied clip; required when
            ``voice`` is "custom_voice".
        voice_b: Optional second voice; "disabled" skips it.
        voice_c: Optional third voice; "disabled" skips it.
        preset: Preset name forwarded to ``tts.tts_with_preset``.
        seed: Forwarded as ``use_deterministic_seed``.

    Returns:
        A 4-tuple for the four audio outputs: the preview of the first
        reference clip as ``(22050, ndarray)`` (or ``None`` when there is no
        reference clip to preview), followed by the three generated
        candidates, each as ``(24000, ndarray)``.

    Raises:
        gr.Error: If "custom_voice" is selected but no audio was provided.
    """
    is_custom = voice == "custom_voice"

    # Named voices to load from disk; the custom clip is handled separately.
    voices = [] if is_custom else [voice]
    voices.extend(extra for extra in (voice_b, voice_c) if extra != "disabled")

    # A preset emotion takes priority over the free-form prompt.
    if emotion != "None/Custom":
        text = f"[I am really {emotion.lower()},] {text}"
    elif prompt.strip() != "":
        text = f"[{prompt},] {text}"

    mic_clip = None
    if is_custom:
        if mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing custom voice")
        mic_clip = load_audio(mic_audio, 22050)

    if len(voices) <= 1:
        if is_custom:
            # NOTE(review): a single extra voice selected together with
            # custom_voice lands in this branch and is ignored; only the
            # custom clip is used. Behavior kept as-is — confirm intent.
            voice_samples, conditioning_latents = [mic_clip], None
        else:
            voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)
        if is_custom:
            voice_samples.extend([mic_clip])

    # The preview is optional: there may be no raw reference clips at all
    # (None or an empty list), in which case nothing can be previewed.
    has_samples = voice_samples is not None and len(voice_samples) > 0
    sample_voice = voice_samples[0] if has_samples else None

    start_time = time.time()
    gen, _ = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset=preset,
        use_deterministic_seed=seed,
        return_deterministic_state=True,
        k=3,
    )
    elapsed = time.time() - start_time

    # Explicit UTF-8 so non-ASCII text cannot make the log write fail after
    # the (expensive) generation has already succeeded.
    with open("Tortoise_TTS_Runs.log", "a", encoding="utf-8") as log_file:
        log_file.write(
            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {elapsed} | Seed: {seed}\n"
        )

    # Return None for the preview slot instead of dereferencing a missing clip.
    preview = None
    if sample_voice is not None:
        preview = (22050, sample_voice.squeeze().cpu().numpy())

    candidates = tuple(
        (24000, gen[index].squeeze().cpu().numpy()) for index in range(3)
    )
    return (preview, *candidates)
|
|
|
def main():
    """Build the Gradio interface around ``inference`` and launch it.

    The voice dropdowns list every entry found in ``tortoise/voices``
    (relative to the current working directory) followed by
    ``VOICE_OPTIONS``. The interface is launched with ``share=True``.
    """
    title_html = "<h1 style='text-align: center; color: orange; font-weight: bold;'>RJ VOICE CLONING</h1>"

    # Scan the voices directory once and sort it so all three dropdowns show
    # the same, stable ordering regardless of the filesystem's listing order.
    voice_choices = sorted(os.listdir(os.path.join("tortoise", "voices"))) + VOICE_OPTIONS
    # Prefer "angie" as before, but fall back to an existing choice if that
    # voice is not present (voice_choices is never empty: VOICE_OPTIONS).
    default_voice = "angie" if "angie" in voice_choices else voice_choices[0]

    def voice_dropdown(label, value):
        """Create one voice selector; each gets its own copy of the choices."""
        return gr.Dropdown(
            list(voice_choices),
            value=value,
            label=label,
            type="value",
        )

    # --- Inputs (creation order matches the positional order of inference) ---
    text = gr.Textbox(lines=4, label="Text:")
    emotion = gr.Radio(
        ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
        value="None/Custom",
        label="Select emotion:",
        type="value",
    )
    prompt = gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:")
    preset = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        value="fast",
        label="Preset mode (determines quality with tradeoff over speed):",
        type="value",
    )
    voice = voice_dropdown("Select voice:", default_voice)
    mic_audio = gr.Audio(
        label="Record voice (when selected custom_voice):",
        type="filepath",
    )
    voice_b = voice_dropdown("(Optional) Select second voice:", "disabled")
    voice_c = voice_dropdown("(Optional) Select third voice:", "disabled")
    seed = gr.Number(value=0, precision=0, label="Seed (for reproducibility):")

    # --- Outputs: reference-clip preview plus the three candidates ---
    selected_voice = gr.Audio(label="Sample of selected voice (first):")
    output_audio_1 = gr.Audio(label="Output [Candidate 1]:")
    output_audio_2 = gr.Audio(label="Output [Candidate 2]:")
    output_audio_3 = gr.Audio(label="Output [Candidate 3]:")

    interface = gr.Interface(
        fn=inference,
        inputs=[text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed],
        outputs=[selected_voice, output_audio_1, output_audio_2, output_audio_3],
        title="RJ VOICE CLONING",
        description=title_html,
        css=".gradio-container { background-color: black; color: orange; }",
    )

    interface.launch(share=True)
|
|
|
if __name__ == "__main__":
    # `tts` is deliberately a module-level name: inference() looks it up as a
    # global when the UI invokes it.
    tts = TextToSpeech()

    # Append a dated banner so each session's entries are grouped in the log.
    with open("Tortoise_TTS_Runs.log", "a") as f:
        f.write(
            f"\n\n-------------------------Tortoise TTS Logs, {datetime.now()}-------------------------\n"
        )

    # Build and launch the UI.
    main()
|
|