# NOTE: the following lines are residue from the Hugging Face file-viewer
# page (author/commit chrome), not code — kept as comments for provenance.
# SohomToom's picture
# Update app.py
# 49c7767 verified
# raw
# history blame
# 2.67 kB
import os
import torch

# Point torch's model cache at a writable location (the default cache dir
# is not writable at runtime on HF Spaces), then warm the Silero VAD model
# so the first user request doesn't pay the download cost.
os.environ["TORCH_HOME"] = "/tmp/torch"
os.makedirs("/tmp/torch", exist_ok=True)

try:
    torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", trust_repo=True)
    print("Silero VAD model preloaded successfully.")
except Exception as e:
    # Best-effort preload: log and continue; the app can still start and
    # whatever needs VAD may retry the download later.
    print(f"Failed to preload Silero VAD: {e}")
# Configure all cache/config locations to writable /tmp paths BEFORE
# importing libraries that consult them: huggingface_hub / transformers
# read HF_HOME / HF_HUB_CACHE / TRANSFORMERS_CACHE at import time, so
# setting them after the imports (as before) could be too late.
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

# Now proceed with the rest of the imports.
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from openvoice.config import AttrDict  # noqa: F401  kept: may be needed by checkpoint loading
import gradio as gr
import torchaudio  # noqa: F401  kept: openvoice relies on torchaudio backends
import time
import uuid

# Model/config paths.
ckpt_converter = "checkpoints/converter/config.json"
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Initialize the OpenVoice tone-color converter from its config file.
# NOTE(review): ToneColorConverter normally also needs a checkpoint loaded
# (e.g. load_ckpt) — confirm the config path alone is sufficient here.
tone_color_converter = ToneColorConverter(ckpt_converter)

# Module-level cache of the most recently extracted speaker embedding.
ref_speaker_embed = None
def clone_and_speak(text, speaker_wav):
    """Synthesize *text* in the voice of the uploaded reference clip.

    Args:
        text: The English text to speak.
        speaker_wav: Filesystem path to the reference .wav (from Gradio's
            ``type="filepath"`` Audio input); falsy when nothing was uploaded.

    Returns:
        Path to the generated .wav on success, or a user-facing error
        message string when a required input is missing.
    """
    if not speaker_wav:
        return "Please upload a reference .wav file."
    # Guard empty/whitespace-only text the same way as a missing reference.
    if not text or not text.strip():
        return "Please enter some text to synthesize."

    # Unique output filename: timestamp plus a short random suffix avoids
    # collisions between concurrent requests.
    timestamp = str(int(time.time()))
    base_name = f"output_{timestamp}_{uuid.uuid4().hex[:6]}"
    output_wav = os.path.join(output_dir, f"{base_name}.wav")

    # Extract the tone-color embedding from the uploaded reference voice;
    # cached at module level for potential reuse by other code.
    global ref_speaker_embed
    ref_speaker_embed = se_extractor.get_se(speaker_wav, tone_color_converter)

    # Generate speech with the base model, converted to the reference timbre.
    tone_color_converter.infer(
        text=text,
        speaker_id="openvoice",
        language="en",
        ref_speaker=speaker_wav,
        ref_embed=ref_speaker_embed,
        output_path=output_wav,
        top_k=10,
        temperature=0.3,
    )
    return output_wav
# Gradio interface. HF Spaces looks for a module-level `demo` object, but
# the original never bound one — fix by assigning before launching.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",  # writable temp dir for flagged samples
    title="Text to Voice using OpenVoice",
    description="Clone any voice (English) and generate speech using OpenVoice on CPU.",
)
demo.launch()