# (removed: file-size header and git-blame/line-number gutter — extraction artifacts, not source)
import os
import torch
# Torch must cache under /tmp (the only writable path on HF Spaces);
# eagerly fetch the Silero VAD weights so later calls don't hit the hub.
os.makedirs("/tmp/torch", exist_ok=True)
os.environ["TORCH_HOME"] = "/tmp/torch"
# Best-effort preload: a failure here is reported but not fatal.
try:
    torch.hub.load(
        repo_or_dir="snakers4/silero-vad",
        model="silero_vad",
        trust_repo=True,
    )
    print("Silero VAD model preloaded successfully.")
except Exception as e:
    print(f"Failed to preload Silero VAD: {e}")
# Now proceed with the rest of the imports
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from openvoice.config import AttrDict
import gradio as gr
import torchaudio
# ... other imports if needed
import time
import uuid
# --- Environment fixes for Hugging Face Spaces ---
# The app directory is read-only on Spaces; point every library cache
# (Hugging Face hub/transformers, matplotlib, XDG dirs, numba) at /tmp.
# NOTE(review): these run *after* the openvoice/gradio imports above, so
# libraries that resolve their cache path at import time may already have
# captured the defaults — consider moving this section before the imports.
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)  # writable target for Gradio's flagging_dir
# Set model paths.
# NOTE(review): this path is the converter *config* JSON; the OpenVoice API
# typically also needs the .pth weights loaded (e.g. via load_ckpt) — confirm
# ToneColorConverter resolves the checkpoint from the config path alone.
ckpt_converter = "checkpoints/converter/config.json"
output_dir = "/tmp/outputs"  # generated wavs are written here
os.makedirs(output_dir, exist_ok=True)
# Initialize OpenVoice converter from its config.
tone_color_converter = ToneColorConverter(ckpt_converter)
# Module-level cache of the most recent reference-speaker embedding;
# overwritten on every call to clone_and_speak.
ref_speaker_embed = None
def clone_and_speak(text, speaker_wav):
    """Synthesize *text* in the voice of the uploaded reference clip.

    Parameters
    ----------
    text : str
        English text to synthesize.
    speaker_wav : str | None
        Filesystem path to the reference .wav (Gradio supplies a filepath).

    Returns
    -------
    str
        Path to the generated .wav on success, or a human-readable error
        message when an input is unusable.
    """
    if not speaker_wav:
        return "Please upload a reference .wav file."
    if not text or not text.strip():
        # Guard against empty prompts before running the expensive model.
        return "Please enter some text to synthesize."
    # Unique output name: timestamp gives rough ordering, the uuid suffix
    # avoids collisions between requests landing in the same second.
    timestamp = str(int(time.time()))
    base_name = f"output_{timestamp}_{uuid.uuid4().hex[:6]}"
    output_wav = os.path.join(output_dir, f"{base_name}.wav")
    # Extract the tone-color embedding from the uploaded speaker voice and
    # cache it at module level.
    # NOTE(review): openvoice's se_extractor.get_se commonly returns an
    # (embedding, audio_name) tuple — confirm a bare embedding is expected here.
    global ref_speaker_embed
    ref_speaker_embed = se_extractor.get_se(speaker_wav, tone_color_converter)
    # Generate speech with the base model, conditioned on the reference voice.
    tone_color_converter.infer(
        text=text,
        speaker_id="openvoice",
        language="en",
        ref_speaker=speaker_wav,
        ref_embed=ref_speaker_embed,
        output_path=output_wav,
        top_k=10,
        temperature=0.3,
    )
    return output_wav
# Gradio interface. Bound to a module-level `demo` because HF Spaces looks
# for that exact name when auto-launching the app — the original comment
# promised this but never made the assignment.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",  # safe writable dir on Spaces
    title="Text to Voice using OpenVoice",
    description="Clone any voice (English) and generate speech using OpenVoice on CPU.",
)
demo.launch()