File size: 2,261 Bytes
0c20337 49c7767 fbd01f5 49c7767 fbd01f5 81a8f5e e16fd1e fbd01f5 81a8f5e fbd01f5 a647645 e03756e e16fd1e 43e8b5c 5165e58 fbd01f5 8b02d24 0c5c249 5165e58 fbd01f5 0c5c249 fbd01f5 0c5c249 fbd01f5 0c5c249 fbd01f5 0c20337 fbd01f5 a0ea8bb fbd01f5 0c5c249 5165e58 fbd01f5 0c20337 fbd01f5 e17e821 0c5c249 0c20337 0c5c249 0c20337 0c5c249 fbd01f5 5ad697d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import os
import time
import uuid

# Cache/config locations are exported BEFORE the third-party imports below.
# Libraries such as huggingface_hub resolve their cache directories when they
# are first imported, so values assigned after `import gradio` / `import
# openvoice` can be silently ignored.  Everything is pointed at /tmp, the
# writable location the original setup already targeted for HF Spaces.
_CACHE_ENV = {
    # Writable cache directory for torch.
    "TORCH_HOME": "/tmp/torch",
    # Environment fixes for HF Spaces.
    "HF_HOME": "/tmp/huggingface",
    "HF_HUB_CACHE": "/tmp/huggingface",
    "TRANSFORMERS_CACHE": "/tmp/huggingface",
    "MPLCONFIGDIR": "/tmp",
    "XDG_CACHE_HOME": "/tmp",
    "XDG_CONFIG_HOME": "/tmp",
    # NOTE(review): numba documents NUMBA_CACHE_DIR for relocating its cache;
    # confirm that NUMBA_DISABLE_CACHE is honoured by the installed version.
    "NUMBA_DISABLE_CACHE": "1",
}
os.environ.update(_CACHE_ENV)

# Create the directories up front so the first write into them cannot fail.
for _cache_dir in ("/tmp/torch", "/tmp/huggingface", "/tmp/flagged"):
    os.makedirs(_cache_dir, exist_ok=True)

# Third-party imports intentionally follow the environment setup above.
import torch  # noqa: E402
import gradio as gr  # noqa: E402
from openvoice import se_extractor  # noqa: E402
from openvoice.api import ToneColorConverter  # noqa: E402
# Model paths: `ckpt_converter` is the converter's config file, and generated
# audio is written below `output_dir`.
ckpt_converter = "checkpoints/converter/config.json"
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Choose the device explicitly so the app also starts on CPU-only hardware.
# NOTE(review): upstream OpenVoice defaults `device` to "cuda:0" and asserts
# that CUDA is available -- confirm against the installed openvoice version.
_device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Initialize the OpenVoice converter.
tone_color_converter = ToneColorConverter(ckpt_converter, device=_device)

# Building the converter from its config alone does not load trained weights;
# upstream usage calls `load_ckpt` with the checkpoint stored beside the
# config.  NOTE(review): the file name "checkpoint.pth" follows the upstream
# layout -- confirm it matches what this Space ships.
_converter_weights = os.path.join(os.path.dirname(ckpt_converter), "checkpoint.pth")
if os.path.isfile(_converter_weights):
    tone_color_converter.load_ckpt(_converter_weights)
else:
    print(
        f"WARNING: converter weights not found at {_converter_weights}; "
        "the tone color converter is running without a loaded checkpoint."
    )

# Most recently extracted speaker embedding; `clone_and_speak` overwrites it
# on every call.
ref_speaker_embed = None
def clone_and_speak(text, speaker_wav):
    """Generate speech for *text* using the uploaded reference voice.

    Args:
        text: Text entered by the user.
        speaker_wav: Path to the reference recording (the Gradio audio input
            is declared with ``type="filepath"``), or a falsy value when
            nothing was uploaded.

    Returns:
        Path of the ``.wav`` file written under ``output_dir``.

    Raises:
        gr.Error: If the reference audio or the text is missing.
    """
    global ref_speaker_embed

    # The output component is an audio player, so a returned message string
    # would be interpreted as an audio file path.  Raising gr.Error is how a
    # readable message is surfaced to the user in the UI.
    if not speaker_wav:
        raise gr.Error("Please upload a reference .wav file.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    # Unique output name: second-resolution timestamp plus a short random
    # suffix, so requests arriving in the same second do not collide.
    stamp = int(time.time())
    output_wav = os.path.join(
        output_dir, f"output_{stamp}_{uuid.uuid4().hex[:6]}.wav"
    )

    # Extract the style embedding from the uploaded speaker voice.
    # NOTE(review): upstream `se_extractor.get_se` returns a
    # (embedding, audio_name) tuple -- confirm whether this value needs to be
    # unpacked before it is passed on as `ref_embed`.
    ref_speaker_embed = se_extractor.get_se(
        speaker_wav, tone_color_converter, vad=False
    )

    # NOTE(review): upstream `ToneColorConverter.convert` takes
    # (audio_src_path, src_se, tgt_se, output_path, ...) and converts existing
    # audio rather than synthesizing from text -- confirm that the installed
    # openvoice build accepts the keyword arguments used here.
    tone_color_converter.convert(
        text=text,
        speaker_id="openvoice",
        language="en",
        ref_speaker=speaker_wav,
        ref_embed=ref_speaker_embed,
        output_path=output_wav,
        top_k=10,
        temperature=0.3,
    )
    return output_wav
# Gradio interface, bound to the module-level name `demo` so that tooling
# which looks the app up by that name (HF Spaces, `gradio` reload mode) can
# find it.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",  # safe temporary dir
    title="Text to Voice using OpenVoice",
    description="Clone any voice (English) and generate speech using OpenVoice on CPU.",
)

# Start the web server; this runs whenever the module is executed.
demo.launch()
|