File size: 1,264 Bytes
cc72f0f e0613b2 d08e12e e0613b2 d08e12e e0613b2 4e3ae1a d08e12e 4e3ae1a e0613b2 d08e12e e0613b2 d08e12e 4e3ae1a d08e12e a7a3a53 e0613b2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
from TTS.api import TTS
# Load the XTTS v2 voice-cloning model once, at import time, as a module-level object.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(
    model_name,
    gpu=False,  # switch to True when a CUDA device is available
)
def clone_voice(reference_audio, text, language="en"):
    """Synthesize ``text`` in the voice of the supplied reference recording.

    Args:
        reference_audio: ``(sample_rate, samples)`` tuple, which is the shape
            the ``gr.Audio(type="numpy")`` input component hands to the
            callback. ``None`` when the user has not uploaded anything.
        text: The sentence(s) to speak.
        language: Language code forwarded to the TTS model. Defaults to
            ``"en"``, the value this function previously hard-coded.

    Returns:
        ``(sample_rate, samples)`` for the generated speech, as read back
        from the WAV file written by the model.

    Raises:
        gr.Error: If no reference audio was provided or ``text`` is blank.
    """
    if reference_audio is None:
        raise gr.Error("Please upload a reference audio clip.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")

    # Gradio delivers numpy audio as (sample_rate, data); soundfile.write
    # expects (file, data, samplerate), so unpack and pass them in that order.
    sample_rate, samples = reference_audio

    # A per-call scratch directory keeps concurrent requests from clobbering
    # each other's files and removes them once the result is in memory.
    with tempfile.TemporaryDirectory() as workdir:
        ref_audio_path = os.path.join(workdir, "reference.wav")
        output_path = os.path.join(workdir, "cloned_voice.wav")

        sf.write(ref_audio_path, samples, sample_rate)

        tts.tts_to_file(
            text=text,
            speaker_wav=ref_audio_path,
            language=language,
            file_path=output_path,
        )

        # Read the result before the temporary directory is deleted.
        cloned_audio, sr = sf.read(output_path)

    return (sr, cloned_audio)
# Web UI: one audio upload and one text box in, one audio player out.
description = (
    "Upload an audio file of the voice you want to clone "
    "and provide text to generate speech in the same voice."
)

iface = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Audio(source="upload", type="numpy", label="Reference Audio"),
        gr.Textbox(label="Text to Speak"),
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
    title="Real-Time Voice Cloning with XTTS",
    description=description,
)

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()
|