File size: 4,268 Bytes
c53363c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import streamlit as st
import torch
import tempfile
import os
import glob
from TTS.api import TTS
import numba

# Disable numba JIT cache for better compatibility
# NOTE(review): `numba` (and `TTS`, which may pull numba in transitively) is
# already imported above, so this environment variable is set after import —
# presumably too late to influence numba's configuration; confirm, and confirm
# that NUMBA_DISABLE_CACHE is a variable numba actually reads.
os.environ["NUMBA_DISABLE_CACHE"] = "1"
# Ask numba to use its "workqueue" threading layer.
numba.config.THREADING_LAYER = "workqueue"
# Turn JIT compilation off globally.
# NOTE(review): functions decorated before this line runs (e.g. during the
# `TTS` import above) may already be compiled/wrapped — verify this has the
# intended effect.
numba.config.DISABLE_JIT = True

# Build the XTTS v2 model; st.cache_resource keeps the instance between reruns
@st.cache_resource
def load_xtts_model():
    """Return a ``TTS`` instance for the multilingual XTTS v2 model.

    The ``gpu`` flag is set from ``torch.cuda.is_available()``, so the model
    is asked to use the GPU when CUDA is present and the CPU otherwise.
    """
    use_gpu = torch.cuda.is_available()
    model_id = "tts_models/multilingual/multi-dataset/xtts_v2"
    return TTS(model_name=model_id, gpu=use_gpu)

# Load model (load_xtts_model is wrapped in st.cache_resource)
tts = load_xtts_model()

# UI
st.title("XTTS Voice Cloning Demo")
st.markdown("1. Select a demo voice OR upload your own\n2. Choose or write text\n3. Hear your cloned voice!")

# Load pre-recorded demo voices.
# glob returns entries in arbitrary order and also matches sub-directories, so
# keep regular files only and sort them to give the selectbox a stable order.
demo_voice_dir = "./demo_voices"
demo_files = sorted(
    path for path in glob.glob(f"{demo_voice_dir}/*") if os.path.isfile(path)
)
demo_names = [os.path.basename(f) for f in demo_files]

# Voice input selection
voice_source = st.radio("Choose voice input method:", ["Use pre-recorded demo voice", "Upload your own voice"])

# Path of the reference voice sample; stays None until the user provides one
speaker_wav_path = None

if voice_source == "Use pre-recorded demo voice":
    if demo_files:
        selected_demo = st.selectbox("Choose a demo voice:", demo_names)
        speaker_wav_path = os.path.join(demo_voice_dir, selected_demo)
        st.audio(speaker_wav_path, format="audio/wav")
    else:
        st.warning("No demo voices found in 'demo_voices/' folder.")

elif voice_source == "Upload your own voice":
    uploaded_file = st.file_uploader("Upload your voice sample (WAV, mono, 16k–48kHz):", type=["wav"])
    if uploaded_file:
        # Streamlit re-executes this script on every widget interaction. Writing
        # a fresh delete=False temp file each time would leave one orphaned file
        # per rerun, so remember the file written for this upload in the session
        # state and reuse it while it still exists on disk.
        upload_key = getattr(uploaded_file, "file_id", None) or (
            uploaded_file.name,
            uploaded_file.size,
        )
        cached_upload = st.session_state.get("uploaded_voice_sample")
        if (
            cached_upload is not None
            and cached_upload["key"] == upload_key
            and os.path.exists(cached_upload["path"])
        ):
            speaker_wav_path = cached_upload["path"]
        else:
            # A different file was uploaded (or the old copy was removed):
            # drop the stale copy, if any, before writing the new one.
            if cached_upload is not None and os.path.exists(cached_upload["path"]):
                os.remove(cached_upload["path"])
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                # getvalue() returns the whole buffer regardless of read position
                temp_audio.write(uploaded_file.getvalue())
                speaker_wav_path = temp_audio.name
            st.session_state["uploaded_voice_sample"] = {
                "key": upload_key,
                "path": speaker_wav_path,
            }
        st.audio(speaker_wav_path, format="audio/wav")

# Predefined sample sentences (Hindi and English) offered in the text picker.
# Each sentence is both the label shown to the user and the text synthesized.
_sample_sentences = (
    "नमस्ते, यह मेरी क्लोन की गई आवाज़ है।",
    "Hello Everyone, This is my voice cloned using previously recorded voice sample",
    "मैं आर्टिफिशियल इंटेलिजेंस की मदद से बोल रहा हूँ।",
    "यह आवाज़ असली नहीं है, लेकिन क्या आपने फर्क किया?",
    "This is not my real voice, but can you tell the difference",
    "जीवन एक सुंदर यात्रा है, हर पल को जीओ।",
)
predefined_texts = {sentence: sentence for sentence in _sample_sentences}
# The final entry maps to the marker value "custom" instead of a sentence
predefined_texts["Use custom text"] = "custom"

# Text selection for synthesis: a predefined sentence or free-form input
selected_text = st.selectbox("Choose or write text to synthesize:", list(predefined_texts))
_chosen_value = predefined_texts[selected_text]
input_text = (
    st.text_area("Enter custom text:", "Hello, how are you?")
    if _chosen_value == "custom"
    else _chosen_value
)

# Clone & Synthesize functionality
def _detect_xtts_language(text):
    """Return the XTTS language code to use for *text*.

    Returns "hi" when the text contains any Devanagari character
    (U+0900–U+097F), otherwise "en". Only these two languages are
    distinguished because they are the ones used by the predefined texts.
    """
    if any("\u0900" <= ch <= "\u097f" for ch in text):
        return "hi"
    return "en"


def _remove_quietly(path):
    """Delete *path*, ignoring the case where it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


if speaker_wav_path and input_text.strip():
    if st.button("🎧 Clone & Synthesize"):
        # Write to a unique temp file rather than a fixed name in the working
        # directory, so concurrent sessions cannot overwrite each other's output.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out_file:
            output_path = out_file.name

        try:
            with st.spinner("Cloning voice..."):
                # Clone and synthesize the voice using XTTS model. The language
                # follows the script of the text: a hard-coded "en" would be
                # applied to the Hindi sentences as well.
                tts.tts_to_file(
                    text=input_text,
                    speaker_wav=speaker_wav_path,
                    language=_detect_xtts_language(input_text),
                    file_path=output_path
                )

                # Load the result into memory so the temp file can be deleted
                with open(output_path, "rb") as result_file:
                    audio_bytes = result_file.read()

                # Display the cloned audio
                st.success("Done! Here's your cloned voice:")
                st.audio(audio_bytes, format="audio/wav")
        finally:
            # Runs on failure too, so temp files are not left behind
            _remove_quietly(output_path)
            # Clean up temp file if uploaded
            if voice_source == "Upload your own voice":
                _remove_quietly(speaker_wav_path)