import json
import os
from pathlib import Path

import gradio as gr
import librosa
import torch
from demucs.apply import apply_model
from demucs.pretrained import DEFAULT_MODEL, get_model
from huggingface_hub import hf_hub_download, list_repo_files
from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc

###################################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
###################################################################
# The Hugging Face Hub repo ID
repo_id = "vettorazi/vettorazi"

# If None, uses the latest ckpt in the repo
ckpt_name = None

# If None, uses "kmeans.pt" if it exists in the repo
cluster_model_name = None

# Set the default f0 method to use - use the one the model was trained on.
# The default for so-vits-svc-fork is "dio".
# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
default_f0_method = "crepe"

# The default ratio of cluster inference to SVC inference.
# If cluster_model_name is not found in the repo, this is set to 0.
default_cluster_infer_ratio = 0.5

# Limit on the duration of audio at inference time. Increase if you can.
# In this parent app, we set the limit with an env var to 30 seconds.
# If you didn't set the env var and you go OOM, try changing 9e9 to <=300ish.
duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
###################################################################

# If no checkpoint was specified, pick the generator checkpoint with the
# highest step count (checkpoints are named G_<step>.pth).
if ckpt_name is None:
    latest_id = sorted(
        [
            int(Path(x).stem.split("_")[1])
            for x in list_repo_files(repo_id)
            if x.startswith("G_") and x.endswith(".pth")
        ]
    )[-1]
    ckpt_name = f"G_{latest_id}.pth"

# Download the optional cluster model; fall back gracefully if it is absent,
# in which case cluster inference is disabled by zeroing the ratio.
cluster_model_name = cluster_model_name or "kmeans.pt"
if cluster_model_name in list_repo_files(repo_id):
    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
else:
    cluster_model_path = None
default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0

generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
hparams = HParams(**json.loads(Path(config_path).read_text()))
speakers = list(hparams.spk.keys())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(
    net_g_path=generator_path,
    config_path=config_path,
    device=device,
    cluster_model_path=cluster_model_path,
)
# Demucs source-separation model. It is loaded at startup, but the Gradio
# inference path below does not use it.
demucs_model = get_model(DEFAULT_MODEL)


def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    # Load and peak-normalize the input at the model's sample rate,
    # truncating to the configured duration limit.
    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
    audio = librosa.util.normalize(audio)
    out = model.predict(
        audio,
        speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, out


def voice_cloning(speaker, audio):
    # Forward the configured defaults, and return (sample_rate, audio_data):
    # the tuple order that Gradio's Audio output component expects.
    sample_rate, audio_data = predict(
        speaker,
        audio,
        cluster_infer_ratio=default_cluster_infer_ratio,
        f0_method=default_f0_method,
    )
    return sample_rate, audio_data


# Configure the Gradio interface. type="filepath" hands predict() a file
# path that librosa.load can open.
inputs = [
    gr.Dropdown(choices=speakers, label="Speaker"),
    gr.Audio(type="filepath", label="Audio"),
]
outputs = gr.Audio(label="Cloned Audio")
iface = gr.Interface(fn=voice_cloning, inputs=inputs, outputs=outputs)

if __name__ == "__main__":
    iface.launch()
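
# A sketch of programmatic use, bypassing the UI. "input.wav" and
# "output.wav" are placeholder paths, not files shipped with this app.
#
#   import soundfile as sf
#
#   sr, wav = predict(
#       speakers[0],
#       "input.wav",
#       cluster_infer_ratio=default_cluster_infer_ratio,
#       f0_method=default_f0_method,
#   )
#   sf.write("output.wav", wav, sr)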