File size: 2,261 Bytes
0c20337
49c7767
fbd01f5
 
49c7767
fbd01f5
81a8f5e
e16fd1e
fbd01f5
81a8f5e
fbd01f5
 
 
a647645
e03756e
 
e16fd1e
 
 
 
 
43e8b5c
5165e58
fbd01f5
 
8b02d24
0c5c249
5165e58
fbd01f5
0c5c249
 
fbd01f5
 
0c5c249
 
 
 
 
fbd01f5
 
 
 
0c5c249
fbd01f5
 
 
0c20337
fbd01f5
a0ea8bb
fbd01f5
 
 
 
 
 
 
 
0c5c249
5165e58
fbd01f5
0c20337
fbd01f5
e17e821
0c5c249
0c20337
0c5c249
 
0c20337
0c5c249
fbd01f5
 
 
5ad697d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import torch
import time
import uuid
import gradio as gr
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

# Point every cache/config location at /tmp so torch, the Hugging Face
# libraries, matplotlib and numba use a writable location on HF Spaces.
# NOTE(review): these variables are assigned after torch/gradio/openvoice
# have been imported above; any library that reads them at import time will
# not see these values -- confirm, and consider moving this block above the
# third-party imports.
os.environ.update(
    {
        "TORCH_HOME": "/tmp/torch",
        "HF_HOME": "/tmp/huggingface",
        "HF_HUB_CACHE": "/tmp/huggingface",
        "TRANSFORMERS_CACHE": "/tmp/huggingface",
        "MPLCONFIGDIR": "/tmp",
        "XDG_CACHE_HOME": "/tmp",
        "XDG_CONFIG_HOME": "/tmp",
        "NUMBA_DISABLE_CACHE": "1",
    }
)

# Converter config shipped with the checkpoints, and the directory that
# receives every synthesized .wav file.
ckpt_converter = "checkpoints/converter/config.json"
output_dir = "/tmp/outputs"

# Create all scratch directories up front; exist_ok makes restarts harmless.
for _scratch_dir in ("/tmp/torch", "/tmp/huggingface", "/tmp/flagged", output_dir):
    os.makedirs(_scratch_dir, exist_ok=True)

# Build the OpenVoice converter once at import time; every request reuses it.
tone_color_converter = ToneColorConverter(ckpt_converter)

# Most recently extracted speaker embedding (overwritten on every request).
ref_speaker_embed = None

def clone_and_speak(text, speaker_wav):
    """Synthesize ``text`` using the voice from an uploaded reference clip.

    Args:
        text: Text entered in the Gradio textbox.
        speaker_wav: Filesystem path of the uploaded reference audio, or a
            falsy value when the user did not upload anything.

    Returns:
        Path of the generated ``.wav`` file inside ``output_dir``.

    Raises:
        gr.Error: If no reference audio was supplied.
    """
    global ref_speaker_embed

    if not speaker_wav:
        # The interface's only output is a gr.Audio component, so a returned
        # message string would be handled as an audio file path instead of
        # being shown to the user. gr.Error surfaces the message in the UI.
        raise gr.Error("Please upload a reference .wav file.")

    # Timestamp plus a short random suffix keeps concurrent requests from
    # overwriting each other's output file.
    timestamp = str(int(time.time()))
    base_name = f"output_{timestamp}_{uuid.uuid4().hex[:6]}"
    output_wav = os.path.join(output_dir, f"{base_name}.wav")

    # Extract the tone-colour embedding from the uploaded voice and keep the
    # latest one in the module-level cache variable.
    # NOTE(review): upstream OpenVoice's se_extractor.get_se appears to return
    # an (embedding, audio_name) tuple -- confirm against the installed version.
    ref_speaker_embed = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)

    # NOTE(review): upstream ToneColorConverter.convert appears to take
    # audio_src_path / src_se / tgt_se / output_path rather than the keyword
    # arguments below -- confirm this call matches the installed OpenVoice API.
    tone_color_converter.convert(
        text=text,
        speaker_id="openvoice",
        language="en",
        ref_speaker=speaker_wav,
        ref_embed=ref_speaker_embed,
        output_path=output_wav,
        top_k=10,
        temperature=0.3,
    )

    return output_wav

# Gradio interface, bound to the module-level name `demo` so that tooling
# looking for that global (as the original comment intended for HF Spaces)
# can actually find it. Previously the Interface object was created and
# launched without ever being assigned to a name.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",  # safe temporary dir
    title="Text to Voice using OpenVoice",
    description="Clone any voice (English) and generate speech using OpenVoice on CPU.",
)

# Start the web server (blocks when run as a script).
demo.launch()