import json
import os
from pathlib import Path

import gradio as gr
import librosa
import torch
from demucs.apply import apply_model
from demucs.pretrained import DEFAULT_MODEL, get_model
from huggingface_hub import hf_hub_download, list_repo_files
from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc

###################################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
###################################################################
# The Hugging Face Hub repo ID
repo_id = "vettorazi/vettorazi"

# If None, uses the latest ckpt in the repo
ckpt_name = None

# If None, uses "kmeans.pt" if it exists in the repo
cluster_model_name = None

# Set the default f0 method to use - use the one the model was trained on.
# The default for so-vits-svc-fork is "dio".
# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
default_f0_method = "crepe"

# The default ratio of cluster inference to SVC inference.
# If cluster_model_name is not found in the repo, this is set to 0.
default_cluster_infer_ratio = 0.5

# Limit on the duration of audio at inference time. Increase if you can.
# In this parent app, we set the limit with an env var to 30 seconds.
# If you didn't set the env var and you go OOM, try changing 9e9 to <=300ish.
duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
###################################################################

# If no checkpoint was specified, pick the generator checkpoint with the
# highest step count (checkpoints are named G_<step>.pth).
if ckpt_name is None:
    latest_id = sorted(
        [
            int(Path(x).stem.split("_")[1])
            for x in list_repo_files(repo_id)
            if x.startswith("G_") and x.endswith(".pth")
        ]
    )[-1]
    ckpt_name = f"G_{latest_id}.pth"

# Download the optional cluster model; fall back gracefully if it is absent,
# in which case cluster inference is disabled by zeroing the ratio.
cluster_model_name = cluster_model_name or "kmeans.pt"
if cluster_model_name in list_repo_files(repo_id):
    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
else:
    cluster_model_path = None
default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0

generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
hparams = HParams(**json.loads(Path(config_path).read_text()))
speakers = list(hparams.spk.keys())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(
    net_g_path=generator_path,
    config_path=config_path,
    device=device,
    cluster_model_path=cluster_model_path,
)
# Demucs source-separation model. It is loaded at startup, but the Gradio
# inference path below does not use it.
demucs_model = get_model(DEFAULT_MODEL)


def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    # Load and peak-normalize the input at the model's sample rate,
    # truncating to the configured duration limit.
    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
    audio = librosa.util.normalize(audio)
    out = model.predict(
        audio,
        speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, out


def voice_cloning(speaker, audio):
    # Forward the configured defaults, and return (sample_rate, audio_data):
    # the tuple order that Gradio's Audio output component expects.
    sample_rate, audio_data = predict(
        speaker,
        audio,
        cluster_infer_ratio=default_cluster_infer_ratio,
        f0_method=default_f0_method,
    )
    return sample_rate, audio_data


# Configure the Gradio interface. type="filepath" hands predict() a file
# path that librosa.load can open.
inputs = [
    gr.Dropdown(choices=speakers, label="Speaker"),
    gr.Audio(type="filepath", label="Audio"),
]
outputs = gr.Audio(label="Cloned Audio")
iface = gr.Interface(fn=voice_cloning, inputs=inputs, outputs=outputs)

if __name__ == "__main__":
    iface.launch()
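
# A sketch of programmatic use, bypassing the UI. "input.wav" and
# "output.wav" are placeholder paths, not files shipped with this app.
#
#   import soundfile as sf
#
#   sr, wav = predict(
#       speakers[0],
#       "input.wav",
#       cluster_infer_ratio=default_cluster_infer_ratio,
#       f0_method=default_f0_method,
#   )
#   sf.write("output.wav", wav, sr)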