File size: 1,264 Bytes
cc72f0f
 
e0613b2
d08e12e
 
 
e0613b2
 
 
 
 
 
 
 
 
 
 
 
d08e12e
e0613b2
 
 
 
 
 
4e3ae1a
 
d08e12e
4e3ae1a
 
e0613b2
d08e12e
e0613b2
d08e12e
4e3ae1a
 
d08e12e
 
a7a3a53
e0613b2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
from TTS.api import TTS

# Load the XTTS v2 checkpoint once at import time so every request handled by
# this module shares the same model instance.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(
    model_name=model_name,
    gpu=False,  # CPU inference; switch to True when CUDA is available
)

def clone_voice(reference_audio, text, language="en"):
    """
    Generate speech for ``text`` in the voice heard in ``reference_audio``.

    Args:
        reference_audio: ``(sample_rate, samples)`` tuple, which is the layout
            Gradio passes for an ``Audio`` input declared with
            ``type="numpy"``.
        text: The text to synthesise.
        language: Language code forwarded to the TTS model. Defaults to
            ``"en"``, the value that used to be hard-coded.

    Returns:
        A ``(sample_rate, samples)`` tuple holding the generated speech, read
        back from the file the model wrote.

    Raises:
        ValueError: If no reference audio was supplied, the reference audio
            contains no samples, or ``text`` is empty/blank.
    """
    # Validate up front so a missing upload or blank prompt produces a clear
    # message instead of an opaque failure deep inside soundfile or the model.
    if reference_audio is None:
        raise ValueError("Please provide a reference audio clip.")
    if text is None or not text.strip():
        raise ValueError("Please provide some text to speak.")

    # Gradio hands over (sample_rate, samples), whereas soundfile.write takes
    # (file, data, samplerate) -- unpack by name so the two cannot be swapped.
    sample_rate, samples = reference_audio
    if len(samples) == 0:
        raise ValueError("The reference audio clip is empty.")

    # Work in a per-call scratch directory: fixed file names in the working
    # directory would be overwritten by overlapping requests.
    with tempfile.TemporaryDirectory() as workdir:
        ref_audio_path = os.path.join(workdir, "reference.wav")
        output_path = os.path.join(workdir, "cloned_voice.wav")

        sf.write(ref_audio_path, samples, sample_rate)

        tts.tts_to_file(
            text=text,
            speaker_wav=ref_audio_path,
            language=language,
            file_path=output_path,
        )

        # Read the result before the scratch directory is removed.
        cloned_audio, sr = sf.read(output_path)

    return (sr, cloned_audio)

# Web UI: one audio upload plus one text box in, one audio player out.
description = "Upload an audio file of the voice you want to clone and provide text to generate speech in the same voice."
iface = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Audio(source="upload", type="numpy", label="Reference Audio"),
        gr.Textbox(label="Text to Speak"),
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
    title="Real-Time Voice Cloning with XTTS",
    description=description,
)

# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()