File size: 5,626 Bytes
7712bf9
6711545
7712bf9
 
0c20337
ab0bdb4
16c7cf3
 
49c7767
16c7cf3
b5485c0
 
16c7cf3
0856e34
5e9b992
93c38a4
1fbf0a3
ab0bdb4
16c7cf3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab0bdb4
 
16c7cf3
53a7adb
5e9b992
16c7cf3
 
 
0c5c249
5e9b992
 
 
cbb34e3
 
 
16c7cf3
ed1a5ad
 
5e9b992
 
 
 
ed1a5ad
 
8e6abd8
ed1a5ad
c1e585c
6d77b5b
5e9b992
 
cbb34e3
 
 
5e9b992
689f7db
5e9b992
cbb34e3
5e9b992
 
689f7db
 
 
5e9b992
0c20337
16c7cf3
c1e585c
ed1a5ad
5e9b992
ed1a5ad
 
16c7cf3
0c5c249
5165e58
ed1a5ad
0c20337
cbb34e3
 
 
 
 
 
 
 
 
 
 
878264e
41260bf
 
 
 
 
 
 
 
 
 
 
 
16c7cf3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab0bdb4
16c7cf3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# Cache/config locations must be set BEFORE importing libraries that read
# them at import time (torch hub, Hugging Face transformers, matplotlib,
# numba) — on Hugging Face Spaces only /tmp is writable.
import os

os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

import nltk

# Download only the NLTK data MeloTTS needs; nltk.download('all') pulled
# every corpus (multiple GB) on every cold start and then re-downloaded
# these two packages anyway.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

import time
import uuid

import torch
import gradio as gr

# import mecab_patch
# import english_patch
# from melo.api import TTS
from MeloTTS.melo.api import TTS
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
# from meloTTS import english

# Output folder for synthesized / tone-converted audio.
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# OpenVoice tone-colour converter checkpoint directory.
ckpt_converter = "checkpoint/converter"

# Device selection: prefer GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"

tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

def clone_and_speak(text, speaker_wav):
    """Synthesize `text` with MeloTTS, then convert its tone to match `speaker_wav`.

    Args:
        text: Text to synthesize.
        speaker_wav: Filesystem path to a reference .wav whose tone colour
            is cloned onto the synthesized speech.

    Returns:
        Path to the converted output .wav, or an error-message string when
        no reference file was supplied (Gradio displays it to the user).
    """
    if not speaker_wav:
        return "Please upload a reference .wav file."

    # Unique basename so concurrent requests never collide on disk.
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Target tone-colour embedding extracted from the uploaded reference.
    ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)

    # English MeloTTS base model.
    model = TTS(language="EN", device=device)
    speaker_ids = model.hps.data.spk2id

    # The original loop loaded a source embedding for EVERY speaker but kept
    # only the last one; load just that last speaker's embedding instead.
    last_key = list(speaker_ids.keys())[-1]
    speaker_id = speaker_ids[last_key]
    se_key = last_key.lower().replace('_', '-')
    source_se = torch.load(
        f'checkpoint/base_speakers/ses/{se_key}.pth', map_location=device
    )

    # Work around MeloTTS probing MPS even when we explicitly want CPU.
    if torch.backends.mps.is_available() and device == 'cpu':
        torch.backends.mps.is_available = lambda: False

    # Base synthesis, then tone conversion onto the reference embedding.
    model.tts_to_file(text, speaker_id, tmp_melo_path, speed=1.0)
    tone_color_converter.convert(
        audio_src_path=tmp_melo_path,
        src_se=source_se,
        tgt_se=ref_se,
        output_path=final_output_path,
        message="@HuggingFace",
    )

    return final_output_path

# Web UI: a text box plus a reference-voice upload, producing cloned audio.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
)
demo.launch()

# iface = gr.Interface(
#     fn=clone_with_base_speaker,
#     inputs=[
#         gr.Textbox(label="Input Text", placeholder="Enter text to synthesize..."),
#         gr.Dropdown(choices=base_speaker_choices, label="Select Base Speaker"),
#     ],
#     outputs=gr.Audio(type="filepath", label="Cloned Voice Output"),
#     title="Voice Cloning with OpenVoice Base Speakers",
#     description="Choose a base speaker from OpenVoice and enter text to generate voice."
# )

# iface.launch()


# import os
# import time
# import uuid
# import gradio as gr

# from TTS.api import TTS
# from openvoice import se_extractor
# from openvoice.api import ToneColorConverter

# # Import your local english.py logic
# from meloTTS import english

# # Paths
# device = "cuda" if os.system("nvidia-smi") == 0 else "cpu"
# output_dir = "outputs"
# os.makedirs(output_dir, exist_ok=True)

# # Load OpenVoice tone converter
# tone_color_converter = ToneColorConverter(f"{os.getcwd()}/checkpoints", device=device)
# tone_color_converter.load_model()

# def clone_and_speak(text, speaker_wav):
#     if not speaker_wav:
#         return "Please upload a reference .wav file."

#     base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
#     tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
#     final_output_path = f"{output_dir}/{base_name}_converted.wav"

#     # Use English speaker model
#     model = TTS(language="EN", device=device)
#     speaker_ids = model.hps.data.spk2id
#     default_speaker_id = next(iter(speaker_ids.values()))

#     # Generate base TTS voice
#     model.tts_to_file(text, speaker_id=default_speaker_id, file_path=tmp_melo_path, speed=1.0)

#     # Extract style embedding
#     ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)

#     # Convert tone
#     tone_color_converter.convert(
#         audio_src_path=tmp_melo_path,
#         src_se=ref_se,
#         tgt_se=ref_se,
#         output_path=final_output_path,
#         message="@HuggingFace"
#     )

#     return final_output_path

# # Gradio Interface
# demo = gr.Interface(
#     fn=clone_and_speak,
#     inputs=[
#         gr.Textbox(label="Text to Synthesize"),
#         gr.Audio(label="Reference Voice (WAV)", type="filepath")
#     ],
#     outputs=gr.Audio(label="Cloned Voice Output"),
#     title="Voice Cloner with MeloTTS + OpenVoice"
# )

# if __name__ == "__main__":
#     demo.launch()