|
|
|
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
from TTS.api import TTS
|
|
|
|
|
# Identifier of the XTTS v2 checkpoint handed to the TTS constructor below.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

# One shared synthesizer instance, created at import time with GPU use disabled;
# clone_voice() reuses it for every request.
tts = TTS(model_name=model_name, gpu=False)
|
|
|
def clone_voice(reference_audio, text):
    """Generate cloned speech from a reference voice recording and text.

    Args:
        reference_audio: ``(sample_rate, samples)`` tuple, the layout a
            Gradio ``Audio`` input produces when declared with
            ``type="numpy"``.
        text: The text to synthesize in the reference voice.

    Returns:
        A ``(sample_rate, samples)`` tuple with the synthesized audio, read
        back from the file written by the TTS model.

    Raises:
        ValueError: If no reference audio was supplied or ``text`` is blank.
    """
    # Validate up front so the user gets a clear message instead of an
    # opaque TypeError from indexing None or a failure deep inside the model.
    if reference_audio is None:
        raise ValueError("Please provide a reference audio clip to clone.")
    if not text or not text.strip():
        raise ValueError("Please provide some text to speak.")

    # The tuple is (sample_rate, samples); soundfile.write expects
    # (file, data, samplerate), so the two must be passed in that order.
    sample_rate, samples = reference_audio

    # A per-call scratch directory keeps concurrent requests from overwriting
    # each other's files and removes the intermediates when we are done.
    with tempfile.TemporaryDirectory() as work_dir:
        ref_audio_path = os.path.join(work_dir, "reference.wav")
        output_path = os.path.join(work_dir, "cloned_voice.wav")

        sf.write(ref_audio_path, samples, sample_rate)

        tts.tts_to_file(
            text=text,
            speaker_wav=ref_audio_path,
            language="en",
            file_path=output_path,
        )

        # Load the result into memory before the directory is deleted.
        cloned_audio, sr = sf.read(output_path)

    return (sr, cloned_audio)
|
|
|
|
|
# Help text passed to the interface as its description.
description = "Upload an audio file of the voice you want to clone and provide text to generate speech in the same voice."

# Gradio UI: a reference-audio upload plus a text box feed clone_voice(),
# whose return value is shown in an audio output component.
# NOTE(review): `source="upload"` is the Gradio 3.x keyword; newer releases
# appear to use `sources=[...]` instead — confirm against the installed version.
iface = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Audio(source="upload", type="numpy", label="Reference Audio"),
        gr.Textbox(label="Text to Speak"),
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
    title="Real-Time Voice Cloning with XTTS",
    description=description,
)


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|
|
|
|
|
|