# Spaces:
# Runtime error
# Runtime error
import io | |
import json | |
import os | |
from pathlib import Path | |
import librosa | |
import numpy as np | |
import torch | |
import soundfile as sf | |
from demucs.apply import apply_model | |
from demucs.pretrained import DEFAULT_MODEL, get_model | |
import gradio as gr | |
from huggingface_hub import hf_hub_download, list_repo_files | |
from so_vits_svc_fork.hparams import HParams | |
from so_vits_svc_fork.inference.core import Svc | |
###################################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
###################################################################

# The Hugging Face Hub repo ID.
repo_id = "vettorazi/vettorazi"

# If None, the latest "G_<n>.pth" checkpoint in the repo is used.
ckpt_name = None

# If None, "kmeans.pt" is used if it exists in the repo.
cluster_model_name = None

# Set the default f0 type to use - use the one the model was trained on.
# The default for so-vits-svc-fork is "dio".
# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
default_f0_method = "crepe"

# The default ratio of cluster inference to SVC inference.
# If cluster_model_name is not found in the repo, this is set to 0.
default_cluster_infer_ratio = 0.5

# Limit on the duration (in seconds) of audio at inference time; increase it
# if you can. In the parent app the limit is set to 30 seconds via an env var.
# If you did not set the env var and you run out of memory, try changing 9e9
# to roughly 300 or less.
# Going through float() first lets values such as "30.0" be accepted as well
# as plain integers like "30".
duration_limit = int(float(os.environ.get("MAX_DURATION_SECONDS", 9e9)))
###################################################################
# Resolve the model artifacts on the Hub, download them, and load the models.
# The repo listing is fetched once and reused for both lookups below.
repo_files = list_repo_files(repo_id)

if ckpt_name is None:
    # Candidate checkpoints are top-level files named "G_<number>.pth";
    # the one with the largest number is selected.
    checkpoint_ids = [
        int(Path(x).stem.split("_")[1])
        for x in repo_files
        if x.startswith("G_") and x.endswith(".pth")
    ]
    if not checkpoint_ids:
        raise FileNotFoundError(
            f"No 'G_*.pth' checkpoint found in repo {repo_id!r}; set ckpt_name explicitly."
        )
    latest_id = max(checkpoint_ids)
    ckpt_name = f"G_{latest_id}.pth"

cluster_model_name = cluster_model_name or "kmeans.pt"
if cluster_model_name in repo_files:
    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
else:
    cluster_model_path = None

# Without a cluster model the cluster-inference ratio is forced to 0.
default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0

generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
hparams = HParams(**json.loads(Path(config_path).read_text()))

# Speaker names offered in the UI are the keys of the config's "spk" mapping.
speakers = list(hparams.spk.keys())

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(
    net_g_path=generator_path,
    config_path=config_path,
    device=device,
    cluster_model_path=cluster_model_path,
)

# NOTE(review): demucs_model is loaded here but not referenced in the visible
# part of the file - confirm whether it is still needed.
demucs_model = get_model(DEFAULT_MODEL)
def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    """Convert an audio clip to the given speaker's voice.

    The clip is loaded with ``librosa.load`` at the model's target sample
    rate, truncated to ``duration_limit`` seconds, peak-normalized with
    ``librosa.util.normalize`` and then handed to ``model.predict``.

    The keyword defaults are fixed in this signature; the module-level
    ``default_f0_method`` and ``default_cluster_infer_ratio`` values are not
    consulted here, so callers must pass them explicitly to apply them.

    Args:
        speaker: Speaker identifier forwarded to ``model.predict``.
        audio: Audio source accepted by ``librosa.load`` (e.g. a file path).
        transpose, auto_predict_f0, cluster_infer_ratio, noise_scale,
        f0_method, db_thresh, pad_seconds, chunk_seconds, absolute_thresh:
            Forwarded unchanged to ``model.predict``.

    Returns:
        A ``(sample_rate, audio)`` tuple: ``model.target_sample`` followed by
        whatever ``model.predict`` returned.
    """
    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
    audio = librosa.util.normalize(audio)
    out = model.predict(
        audio,
        speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, out
def voice_cloning(speaker, audio):
    """Gradio callback: convert ``audio`` to ``speaker``'s voice.

    Runs ``predict`` with its default settings and returns the result as a
    ``(sample_rate, audio_data)`` tuple. That is the order a Gradio Audio
    output component expects; returning the two values the other way round
    makes the component treat the waveform as the sample rate.
    """
    sample_rate, audio_data = predict(speaker, audio)
    return sample_rate, audio_data
# Configure the Gradio interface.
# Components are taken from the top-level gradio namespace; the older
# gr.inputs / gr.outputs namespaces are deprecated and absent in Gradio 4.
inputs = [
    gr.Dropdown(choices=speakers, label="Speaker"),
    # predict() passes this value to librosa.load, so ask Gradio for a path
    # to the uploaded file rather than its default (sample_rate, array) tuple.
    gr.Audio(type="filepath", label="Audio"),
]
outputs = gr.Audio(label="Cloned Audio")
iface = gr.Interface(fn=voice_cloning, inputs=inputs, outputs=outputs)

if __name__ == "__main__":
    iface.launch()