ASR-model / app.py
Najma-Nur's picture
Update app.py
e0613b2 verified
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
from TTS.api import TTS
# Load the XTTS v2 model once at import time; clone_voice reuses this single
# instance for every request instead of reloading it per call.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(
    model_name=model_name,
    gpu=False,  # Set gpu=True if CUDA is available
)
def clone_voice(reference_audio, text):
    """
    Generate cloned speech from input voice and text.

    Args:
        reference_audio: ``(sample_rate, samples)`` tuple, the layout a
            Gradio ``Audio`` input delivers when configured with
            ``type="numpy"``.
        text: The text to synthesize in the reference speaker's voice.

    Returns:
        A ``(sample_rate, samples)`` tuple read back from the generated
        WAV file, suitable for a Gradio ``Audio`` output.

    Raises:
        ValueError: If no reference audio was supplied, or the text is
            missing or blank.
    """
    # Fail early with a readable message instead of an opaque TypeError
    # from unpacking None or a wasted model call on empty text.
    if reference_audio is None:
        raise ValueError("Please provide a reference audio clip.")
    if text is None or not text.strip():
        raise ValueError("Please provide some text to speak.")

    sample_rate, samples = reference_audio

    # A per-call scratch directory keeps concurrent requests from
    # overwriting each other's files and is removed automatically on exit.
    with tempfile.TemporaryDirectory() as workdir:
        ref_audio_path = os.path.join(workdir, "reference.wav")
        output_path = os.path.join(workdir, "cloned_voice.wav")

        # soundfile.write takes (file, data, samplerate) -- data first.
        sf.write(ref_audio_path, samples, sample_rate)

        # Generate speech
        tts.tts_to_file(
            text=text,
            speaker_wav=ref_audio_path,
            language="en",
            file_path=output_path,
        )

        # Load the generated audio before the scratch directory is deleted.
        cloned_audio, sr = sf.read(output_path)

    return (sr, cloned_audio)
# Gradio Interface: connects clone_voice to an audio upload and a text box.
description = "Upload an audio file of the voice you want to clone and provide text to generate speech in the same voice."
iface = gr.Interface(
    title="Real-Time Voice Cloning with XTTS",
    description=description,
    fn=clone_voice,
    inputs=[
        # NOTE(review): `source=` may be rejected by newer Gradio releases,
        # which appear to use `sources=[...]` instead -- confirm against the
        # installed Gradio version.
        gr.Audio(source="upload", type="numpy", label="Reference Audio"),
        gr.Textbox(label="Text to Speak"),
    ],
    outputs=gr.Audio(label="Cloned Voice Output"),
)

# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    iface.launch()