vettorazi committed
Commit 855c66b
1 Parent(s): 92d8bdb
Files changed (1)
main.py +137 -0
main.py ADDED
@@ -0,0 +1,137 @@
+ import io
+ import json
+ import os
+ from pathlib import Path
+
+ import librosa
+ import numpy as np
+ import torch
+ import soundfile as sf
+ from demucs.apply import apply_model
+ from demucs.pretrained import DEFAULT_MODEL, get_model
+ from fastapi import FastAPI, UploadFile, File
+ from huggingface_hub import hf_hub_download, list_repo_files
+ from starlette.responses import StreamingResponse
+
+ from so_vits_svc_fork.hparams import HParams
+ from so_vits_svc_fork.inference.core import Svc
+
+ app = FastAPI()
+
+ ###################################################################
+ # REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
+ ###################################################################
+ # The Hugging Face Hub repo ID
+ repo_id = "vettorazi/vettorazi"
+
+ # If None, uses the latest ckpt in the repo
+ ckpt_name = None
+
+ # If None, uses "kmeans.pt" if it exists in the repo
+ cluster_model_name = None
+
+ # Set the default f0 method to use - use the one the model was trained on.
+ # The default for so-vits-svc-fork is "dio".
+ # Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
+ default_f0_method = "crepe"
+
+ # The default ratio of cluster inference to SVC inference.
+ # If cluster_model_name is not found in the repo, this is set to 0.
+ default_cluster_infer_ratio = 0.5
+
+ # Limit on the duration of audio at inference time; increase if you can.
+ # In this parent app, we set the limit to 30 seconds with an env var.
+ # If you didn't set the env var and you go OOM, try changing 9e9 to <=300 or so.
+ duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
+ ###################################################################
+
+ if ckpt_name is None:
+     latest_id = sorted(
+         [
+             int(Path(x).stem.split("_")[1])
+             for x in list_repo_files(repo_id)
+             if x.startswith("G_") and x.endswith(".pth")
+         ]
+     )[-1]
+     ckpt_name = f"G_{latest_id}.pth"
+
+ cluster_model_name = cluster_model_name or "kmeans.pt"
+ if cluster_model_name in list_repo_files(repo_id):
+     cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
+ else:
+     cluster_model_path = None
+ default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0
+
+ generator_path = hf_hub_download(repo_id, ckpt_name)
+ config_path = hf_hub_download(repo_id, "config.json")
+ hparams = HParams(**json.loads(Path(config_path).read_text()))
+ speakers = list(hparams.spk.keys())
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
+ demucs_model = get_model(DEFAULT_MODEL)
+
+
+ def predict(
+     speaker,
+     audio,
+     transpose: int = 0,
+     auto_predict_f0: bool = False,
+     cluster_infer_ratio: float = 0,
+     noise_scale: float = 0.4,
+     f0_method: str = "crepe",
+     db_thresh: int = -40,
+     pad_seconds: float = 0.5,
+     chunk_seconds: float = 0.5,
+     absolute_thresh: bool = False,
+ ):
+     audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
+     audio = librosa.util.normalize(audio)
+     out = model.predict(
+         audio,
+         speaker,
+         transpose=transpose,
+         auto_predict_f0=auto_predict_f0,
+         cluster_infer_ratio=cluster_infer_ratio,
+         noise_scale=noise_scale,
+         f0_method=f0_method,
+         db_thresh=db_thresh,
+         pad_seconds=pad_seconds,
+         chunk_seconds=chunk_seconds,
+         absolute_thresh=absolute_thresh,
+     )
+     return model.target_sample, out
+
+
+ @app.post("/voice_cloning/")
+ async def voice_cloning(
+     speaker: str,
+     file: UploadFile = File(...),
+     transpose: int = 0,
+     auto_predict_f0: bool = False,
+     cluster_infer_ratio: float = default_cluster_infer_ratio,
+     noise_scale: float = 0.4,
+     f0_method: str = default_f0_method,
+ ):
+     # Process the uploaded audio file
+     audio_bytes = await file.read()
+     with io.BytesIO(audio_bytes) as audio_io:
+         sample_rate, audio_data = predict(
+             speaker,
+             audio_io,
+             transpose=transpose,
+             auto_predict_f0=auto_predict_f0,
+             cluster_infer_ratio=cluster_infer_ratio,
+             noise_scale=noise_scale,
+             f0_method=f0_method,
+         )
+     # Write the converted audio into an in-memory WAV buffer
+     audio_byte_stream = io.BytesIO()
+     sf.write(audio_byte_stream, audio_data, sample_rate, format="wav")
+     # Stream the WAV bytes back to the client
+     return StreamingResponse(io.BytesIO(audio_byte_stream.getvalue()), media_type="audio/wav")
+
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=8000)
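For reference, a minimal client sketch for exercising the endpoint above, assuming the server is running locally on port 8000. The speaker name "speaker0" and the file paths are placeholders, not values from the repo; valid speaker names are the keys of hparams.spk from the downloaded config.json. FastAPI treats the scalar arguments of voice_cloning as query parameters, while the audio goes in a multipart field named "file".

import requests

# Hypothetical smoke test for the /voice_cloning/ endpoint defined above.
url = "http://localhost:8000/voice_cloning/"
with open("input.wav", "rb") as f:
    resp = requests.post(
        url,
        params={"speaker": "speaker0", "transpose": 0},  # scalar args travel in the query string
        files={"file": ("input.wav", f, "audio/wav")},   # matches the UploadFile field name
    )
resp.raise_for_status()
with open("cloned.wav", "wb") as out_f:
    out_f.write(resp.content)  # the server streams back a WAV file

The same call from the shell, again with placeholder values: curl -X POST "http://localhost:8000/voice_cloning/?speaker=speaker0" -F "file=@input.wav" -o cloned.wav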