vettorazi committed
Commit 855c66b
1 Parent(s): 92d8bdb
Files changed (1)
main.py +137 -0
main.py ADDED
@@ -0,0 +1,137 @@
+ import io
+ import json
+ import os
+ from pathlib import Path
+
+ import librosa
+ import numpy as np
+ import torch
+ import soundfile as sf
+ from demucs.apply import apply_model
+ from demucs.pretrained import DEFAULT_MODEL, get_model
+ from fastapi import FastAPI, UploadFile, File
+ from huggingface_hub import hf_hub_download, list_repo_files
+ from starlette.responses import StreamingResponse
+
+ from so_vits_svc_fork.hparams import HParams
+ from so_vits_svc_fork.inference.core import Svc
+
+ app = FastAPI()
+
+ ###################################################################
+ # REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
+ ###################################################################
+ # The Hugging Face Hub repo ID
+ repo_id = "vettorazi/vettorazi"
+
+ # If None, uses the latest ckpt in the repo
+ ckpt_name = None
+
+ # If None, uses "kmeans.pt" if it exists in the repo
+ cluster_model_name = None
+
+ # Set the default f0 method to use - use the one the model was trained on.
+ # The default for so-vits-svc-fork is "dio".
+ # Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
+ default_f0_method = "crepe"
+
+ # The default ratio of cluster inference to SVC inference.
+ # If cluster_model_name is not found in the repo, this is set to 0.
+ default_cluster_infer_ratio = 0.5
+
+ # Limit on the duration of audio at inference time; increase if you can.
+ # In this parent app, we set the limit to 30 seconds with an env var.
+ # If you didn't set the env var and you go OOM, try changing 9e9 to <=300 or so.
+ duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
+ ###################################################################
+
+ if ckpt_name is None:
+     latest_id = sorted(
+         [
+             int(Path(x).stem.split("_")[1])
+             for x in list_repo_files(repo_id)
+             if x.startswith("G_") and x.endswith(".pth")
+         ]
+     )[-1]
+     ckpt_name = f"G_{latest_id}.pth"
+
+ cluster_model_name = cluster_model_name or "kmeans.pt"
+ if cluster_model_name in list_repo_files(repo_id):
+     cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
+ else:
+     cluster_model_path = None
+ default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0
+
+ generator_path = hf_hub_download(repo_id, ckpt_name)
+ config_path = hf_hub_download(repo_id, "config.json")
+ hparams = HParams(**json.loads(Path(config_path).read_text()))
+ speakers = list(hparams.spk.keys())
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
+ demucs_model = get_model(DEFAULT_MODEL)
+
+
+ def predict(
+     speaker,
+     audio,
+     transpose: int = 0,
+     auto_predict_f0: bool = False,
+     cluster_infer_ratio: float = 0,
+     noise_scale: float = 0.4,
+     f0_method: str = "crepe",
+     db_thresh: int = -40,
+     pad_seconds: float = 0.5,
+     chunk_seconds: float = 0.5,
+     absolute_thresh: bool = False,
+ ):
+     audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
+     audio = librosa.util.normalize(audio)
+     out = model.predict(
+         audio,
+         speaker,
+         transpose=transpose,
+         auto_predict_f0=auto_predict_f0,
+         cluster_infer_ratio=cluster_infer_ratio,
+         noise_scale=noise_scale,
+         f0_method=f0_method,
+         db_thresh=db_thresh,
+         pad_seconds=pad_seconds,
+         chunk_seconds=chunk_seconds,
+         absolute_thresh=absolute_thresh,
+     )
+     return model.target_sample, out
+
+
+ @app.post("/voice_cloning/")
+ async def voice_cloning(
+     speaker: str,
+     file: UploadFile = File(...),
+     transpose: int = 0,
+     auto_predict_f0: bool = False,
+     cluster_infer_ratio: float = default_cluster_infer_ratio,
+     noise_scale: float = 0.4,
+     f0_method: str = default_f0_method,
+ ):
+     # Process the uploaded audio file
+     audio_bytes = await file.read()
+     with io.BytesIO(audio_bytes) as audio_io:
+         sample_rate, audio_data = predict(
+             speaker,
+             audio_io,
+             transpose=transpose,
+             auto_predict_f0=auto_predict_f0,
+             cluster_infer_ratio=cluster_infer_ratio,
+             noise_scale=noise_scale,
+             f0_method=f0_method,
+         )
+     # Write the converted audio into an in-memory WAV buffer
+     audio_byte_stream = io.BytesIO()
+     sf.write(audio_byte_stream, audio_data, sample_rate, format="wav")
+     # Stream the WAV bytes back to the client
+     return StreamingResponse(io.BytesIO(audio_byte_stream.getvalue()), media_type="audio/wav")
+
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=8000)
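For reference, a minimal client sketch for exercising the endpoint above, assuming the server is running locally on port 8000. The speaker name "speaker0" and the file paths are placeholders, not values from the repo; valid speaker names are the keys of hparams.spk from the downloaded config.json. FastAPI treats the scalar arguments of voice_cloning as query parameters, while the audio goes in a multipart field named "file".

import requests

# Hypothetical smoke test for the /voice_cloning/ endpoint defined above.
url = "http://localhost:8000/voice_cloning/"
with open("input.wav", "rb") as f:
    resp = requests.post(
        url,
        params={"speaker": "speaker0", "transpose": 0},  # scalar args travel in the query string
        files={"file": ("input.wav", f, "audio/wav")},   # matches the UploadFile field name
    )
resp.raise_for_status()
with open("cloned.wav", "wb") as out_f:
    out_f.write(resp.content)  # the server streams back a WAV file

The same call from the shell, again with placeholder values: curl -X POST "http://localhost:8000/voice_cloning/?speaker=speaker0" -F "file=@input.wav" -o cloned.wav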