File size: 2,425 Bytes
0c20337
81a8f5e
 
49c7767
 
81a8f5e
 
e16fd1e
81a8f5e
 
a647645
e03756e
 
e16fd1e
 
 
 
81a8f5e
e16fd1e
43e8b5c
5165e58
81a8f5e
8b02d24
0c5c249
5165e58
81a8f5e
 
0c5c249
 
81a8f5e
 
0c5c249
 
 
 
 
81a8f5e
 
 
 
 
 
 
 
 
 
 
0c5c249
81a8f5e
 
 
0c20337
81a8f5e
a0ea8bb
81a8f5e
 
 
 
 
0c5c249
5165e58
81a8f5e
0c20337
81a8f5e
e17e821
0c5c249
0c20337
0c5c249
 
0c20337
0c5c249
81a8f5e
 
 
5ad697d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os

# Set temporary cache locations BEFORE importing the ML stack below.
# melo/openvoice pull in huggingface_hub and transformers, which read
# HF_HOME / TRANSFORMERS_CACHE (and torch reads TORCH_HOME) at import or
# first-use time — setting them after the imports can leave the default,
# read-only cache paths in effect on Hugging Face Spaces.
os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

import uuid
import time
import torch
import gradio as gr
from melo.api import TTS
from openvoice.api import ToneColorConverter

# Output folder for synthesized / converted audio
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Device setting: prefer GPU when available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize tone converter from its config file.
# NOTE(review): typical OpenVoice setups also call
# tone_color_converter.load_ckpt("checkpoints/converter/checkpoint.pth")
# to load the weights — confirm the weights are actually loaded here.
ckpt_converter = "checkpoints/converter/config.json"
tone_color_converter = ToneColorConverter(ckpt_converter)

def clone_and_speak(text, speaker_wav):
    """Synthesize *text* with Melo TTS, then re-tone it to match the
    reference speaker in *speaker_wav* using OpenVoice.

    Args:
        text: Text to synthesize.
        speaker_wav: Filesystem path to a reference .wav file (Gradio
            ``type="filepath"`` audio input).

    Returns:
        Path to the converted output .wav file.

    Raises:
        gr.Error: If no reference file was supplied.
    """
    # Raise instead of returning a message string: the Gradio output
    # component is gr.Audio, which cannot render a plain-text reply —
    # returning a str here would surface as a broken audio player.
    if not speaker_wav:
        raise gr.Error("Please upload a reference .wav file.")

    # Unique per-request names so concurrent requests never collide.
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Use English speaker model.
    # NOTE(review): loading the TTS model on every call is slow; consider
    # caching it at module level if request latency matters.
    model = TTS(language="EN", device=device)
    speaker_ids = model.hps.data.spk2id
    default_speaker_id = next(iter(speaker_ids.values()))

    # Generate base TTS voice
    model.tts_to_file(text, default_speaker_id, tmp_melo_path)

    from openvoice import se_extractor

    # Target style embedding: the uploaded reference voice.
    ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
    # Source style embedding: the Melo-generated audio. The original code
    # passed ref_se for BOTH src_se and tgt_se, which makes the tone
    # conversion a near no-op (converting a voice onto itself).
    src_se, _ = se_extractor.get_se(tmp_melo_path, tone_color_converter, vad=False)

    try:
        # Run the tone conversion: map the base voice onto the reference tone.
        tone_color_converter.convert(
            audio_src_path=tmp_melo_path,
            src_se=src_se,
            tgt_se=ref_se,
            output_path=final_output_path,
            message="@HuggingFace",
        )
    finally:
        # Don't let intermediate files accumulate in /tmp across requests.
        if os.path.exists(tmp_melo_path):
            os.remove(tmp_melo_path)

    return final_output_path

# Build the Gradio UI: one text box plus a reference-voice upload in,
# one synthesized audio clip out.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
)

demo.launch()