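# app.py (2.26 kB)
# Hugging Face Space file by SohomToom.
# Commit fbd01f5 (verified): "Update app.py"
# (Header captured from the web file viewer: raw / history / blame links.)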
import os
import torch
import time
import uuid
import gradio as gr
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
# Set writable cache directory for torch
# NOTE(review): everything below is assigned after torch, gradio and openvoice
# have already been imported at the top of the file. Any library that reads
# these variables at import time will not see the values set here; consider
# moving this environment setup above the third-party imports.
os.environ["TORCH_HOME"] = "/tmp/torch"
os.makedirs("/tmp/torch", exist_ok=True)

# Environment fixes for HF Spaces: point caches and config at /tmp.
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

# Set model paths
ckpt_converter = "checkpoints/converter/config.json"
# Assumes the converter weights sit next to the config as ``checkpoint.pth``
# (the layout used by the OpenVoice demos) — TODO confirm for this Space.
ckpt_converter_weights = os.path.join(
    os.path.dirname(ckpt_converter), "checkpoint.pth"
)
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Initialize OpenVoice converter.
# The device is passed explicitly: OpenVoice defaults to "cuda:0" and asserts
# that CUDA is available, which fails on a CPU-only machine.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
tone_color_converter = ToneColorConverter(ckpt_converter, device=device)
# Constructing the converter only builds the network from the config; the
# trained weights have to be loaded separately.
tone_color_converter.load_ckpt(ckpt_converter_weights)

# Speaker embedding cache
ref_speaker_embed = None
# Assumed location of the OpenVoice English base-speaker checkpoint
# (config.json, checkpoint.pth, en_default_se.pth), following the layout used
# by the OpenVoice demos — TODO confirm against the files shipped with this app.
_BASE_SPEAKER_DIR = "checkpoints/base_speakers/EN"

# Lazily populated ``(base_tts, source_se)`` pair; filled on first request.
_base_speaker_cache = None


def _load_base_speaker():
    """Return the cached ``(base_tts, source_se)`` pair, loading it on first use."""
    global _base_speaker_cache
    if _base_speaker_cache is None:
        # Local import: the file-level imports only bring in ToneColorConverter.
        from openvoice.api import BaseSpeakerTTS

        # Keep the base speaker on the same device as the converter.
        device = tone_color_converter.device
        base_tts = BaseSpeakerTTS(
            os.path.join(_BASE_SPEAKER_DIR, "config.json"), device=device
        )
        base_tts.load_ckpt(os.path.join(_BASE_SPEAKER_DIR, "checkpoint.pth"))
        source_se = torch.load(
            os.path.join(_BASE_SPEAKER_DIR, "en_default_se.pth"),
            map_location=device,
        ).to(device)
        _base_speaker_cache = (base_tts, source_se)
    return _base_speaker_cache


def clone_and_speak(text, speaker_wav):
    """Speak ``text`` in the voice of the uploaded reference recording.

    The text is first synthesized with the base speaker, then the tone color
    of that audio is converted to match the reference recording.

    Args:
        text: Text to synthesize.
        speaker_wav: Path to the reference recording, as delivered by the
            ``gr.Audio(type="filepath")`` input; falsy when nothing was
            uploaded.

    Returns:
        Path of the generated ``.wav`` file inside ``output_dir``.

    Raises:
        gr.Error: If no reference audio was uploaded or ``text`` is blank.
    """
    # The output component is gr.Audio, so a returned message string would be
    # treated as a file path; raise gr.Error to show the message to the user.
    if not speaker_wav:
        raise gr.Error("Please upload a reference .wav file.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Generate a unique filename
    timestamp = str(int(time.time()))
    base_name = f"output_{timestamp}_{uuid.uuid4().hex[:6]}"
    output_wav = os.path.join(output_dir, f"{base_name}.wav")
    base_wav = os.path.join(output_dir, f"{base_name}_base.wav")

    # Extract style from uploaded speaker voice. get_se returns an
    # (embedding, audio_name) pair; only the embedding is needed. The result
    # is kept local so concurrent requests cannot overwrite each other.
    target_se, _audio_name = se_extractor.get_se(
        speaker_wav,
        tone_color_converter,
        target_dir=os.path.join(output_dir, "processed"),  # keep writes under /tmp
        vad=False,
    )

    base_tts, source_se = _load_base_speaker()
    try:
        # Step 1: generate speech with the base speaker.
        base_tts.tts(
            text, base_wav, speaker="default", language="English", speed=1.0
        )
        # Step 2: convert the base voice to the reference speaker's tone color.
        tone_color_converter.convert(
            audio_src_path=base_wav,
            src_se=source_se,
            tgt_se=target_se,
            output_path=output_wav,
            tau=0.3,
        )
    finally:
        # The intermediate base-speaker audio is not needed once converted.
        if os.path.exists(base_wav):
            os.remove(base_wav)

    return output_wav
# Gradio interface (exposed as global `demo` for HF Spaces).
# The interface is bound to a module-level name before launching so that the
# object stays reachable as `demo`, which the comment above promises.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",  # safe temporary dir
    title="Text to Voice using OpenVoice",
    description="Clone any voice (English) and generate speech using OpenVoice on CPU.",
)
demo.launch()