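"""Gradio demo: clone a voice from an uploaded reference .wav and synthesize
English speech with OpenVoice on CPU.

A sketch of the two-stage OpenVoice v1 pipeline (base-speaker TTS, then
tone-color conversion). Checkpoint paths below assume the standard
OpenVoice v1 download layout under ./checkpoints.
"""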
import os

# Redirect Hugging Face Hub caches to a writable location
# (the default cache dir may be read-only in hosted containers)
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"  # legacy alias, kept for older transformers

# Redirect matplotlib/fontconfig state and Numba caching to /tmp as well
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)
import time
import uuid

import gradio as gr
import torch
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

# Model paths (assumes the standard OpenVoice v1 checkpoint layout)
ckpt_base = "checkpoints/base_speakers/EN"
ckpt_converter = "checkpoints/converter"
device = "cpu"  # this demo targets CPU-only inference
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Base speaker TTS: synthesizes speech in a neutral built-in voice
base_speaker_tts = BaseSpeakerTTS(f"{ckpt_base}/config.json", device=device)
base_speaker_tts.load_ckpt(f"{ckpt_base}/checkpoint.pth")

# Tone color converter: transfers the reference speaker's timbre onto
# the base speaker's output
tone_color_converter = ToneColorConverter(f"{ckpt_converter}/config.json", device=device)
tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")

# Tone-color embedding of the base speaker (source side of the conversion)
source_se = torch.load(f"{ckpt_base}/en_default_se.pth", map_location=device)

def clone_and_speak(text, speaker_wav):
    if not speaker_wav:
        raise gr.Error("Please upload a reference .wav file.")

    # Unique filenames so concurrent requests don't overwrite each other
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_wav = os.path.join(output_dir, f"{base_name}_base.wav")
    output_wav = os.path.join(output_dir, f"{base_name}.wav")

    # Extract the tone-color embedding of the uploaded reference voice;
    # get_se returns an (embedding, audio_name) tuple
    target_se, _ = se_extractor.get_se(
        speaker_wav, tone_color_converter, target_dir="/tmp/processed", vad=True
    )

    # Step 1: synthesize the text in the base speaker's voice
    base_speaker_tts.tts(text, tmp_wav, speaker="default", language="English", speed=1.0)

    # Step 2: re-color the audio with the reference speaker's embedding
    tone_color_converter.convert(
        audio_src_path=tmp_wav,
        src_se=source_se,
        tgt_se=target_se,
        output_path=output_wav,
    )

    return output_wav

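# Quick smoke test outside Gradio (hypothetical sample.wav in the working dir):
#   print(clone_and_speak("Hello from OpenVoice.", "sample.wav"))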

demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",
    title="Text to Voice using OpenVoice",
    description="Clone any voice (English) and generate speech using OpenVoice on CPU.",
)

if __name__ == "__main__":
    demo.launch(debug=True)
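
# Usage sketch (assumes this file is saved as app.py and the OpenVoice v1
# checkpoints, e.g. from the myshell-ai/OpenVoice release, live in ./checkpoints):
#   python app.py
# On hosted runtimes such as Hugging Face Spaces, launch() typically picks up
# the host/port from the environment.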