vettorazi committed on
Commit
4bdcce8
1 Parent(s): 01ba3ca

Delete main.py

Files changed (1)
  1. main.py +0 -118
main.py DELETED
@@ -1,118 +0,0 @@
- import io
- import json
- import os
- from pathlib import Path
-
- import librosa
- import numpy as np
- import torch
- import soundfile as sf
- from demucs.apply import apply_model
- from demucs.pretrained import DEFAULT_MODEL, get_model
- import gradio as gr
- from huggingface_hub import hf_hub_download, list_repo_files
-
- from so_vits_svc_fork.hparams import HParams
- from so_vits_svc_fork.inference.core import Svc
-
- ###################################################################
- # REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
- ###################################################################
- # The Hugging Face Hub repo ID
- repo_id = "vettorazi/vettorazi"
-
- # If None, uses the latest ckpt in the repo
- ckpt_name = None
-
- # If None, uses "kmeans.pt" if it exists in the repo
- cluster_model_name = None
-
- # Set the default f0 method to use - use the one the model was trained on.
- # The default for so-vits-svc-fork is "dio".
- # Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
- default_f0_method = "crepe"
-
- # The default ratio of cluster inference to SVC inference.
- # If cluster_model_name is not found in the repo, this is set to 0.
- default_cluster_infer_ratio = 0.5
-
- # Limit on the duration of audio at inference time. Increase it if you can.
- # In this parent app, we set the limit to 30 seconds with an env var.
- # If you didn't set the env var and you go OOM, try changing 9e9 to <=300.
- duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
- ###################################################################
-
- if ckpt_name is None:
-     latest_id = sorted(
-         [
-             int(Path(x).stem.split("_")[1])
-             for x in list_repo_files(repo_id)
-             if x.startswith("G_") and x.endswith(".pth")
-         ]
-     )[-1]
-     ckpt_name = f"G_{latest_id}.pth"
-
- cluster_model_name = cluster_model_name or "kmeans.pt"
- if cluster_model_name in list_repo_files(repo_id):
-     cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
- else:
-     cluster_model_path = None
- default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0
-
- generator_path = hf_hub_download(repo_id, ckpt_name)
- config_path = hf_hub_download(repo_id, "config.json")
- hparams = HParams(**json.loads(Path(config_path).read_text()))
- speakers = list(hparams.spk.keys())
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
- demucs_model = get_model(DEFAULT_MODEL)  # loaded here but never applied anywhere in this app
-
-
- def predict(
-     speaker,
-     audio,
-     transpose: int = 0,
-     auto_predict_f0: bool = False,
-     cluster_infer_ratio: float = 0,
-     noise_scale: float = 0.4,
-     f0_method: str = "crepe",
-     db_thresh: int = -40,
-     pad_seconds: float = 0.5,
-     chunk_seconds: float = 0.5,
-     absolute_thresh: bool = False,
- ):
-     audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
-     audio = librosa.util.normalize(audio)
-     out = model.predict(
-         audio,
-         speaker,
-         transpose=transpose,
-         auto_predict_f0=auto_predict_f0,
-         cluster_infer_ratio=cluster_infer_ratio,
-         noise_scale=noise_scale,
-         f0_method=f0_method,
-         db_thresh=db_thresh,
-         pad_seconds=pad_seconds,
-         chunk_seconds=chunk_seconds,
-         absolute_thresh=absolute_thresh,
-     )
-     return model.target_sample, out
-
-
- def voice_cloning(speaker, audio):
-     sample_rate, audio_data = predict(speaker, audio)
-     return sample_rate, audio_data  # Gradio's audio output expects (sample_rate, data)
-
-
- # Configure the Gradio interface
- inputs = [
-     gr.inputs.Dropdown(choices=speakers, label="Speaker"),
-     gr.inputs.Audio(label="Audio", type="filepath"),  # predict() passes this path to librosa.load
- ]
-
- outputs = gr.outputs.Audio(label="Cloned Audio")
-
- iface = gr.Interface(fn=voice_cloning, inputs=inputs, outputs=outputs)
-
- if __name__ == "__main__":
-     iface.launch()
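
Note that the deleted file uses the legacy gr.inputs / gr.outputs namespaces, which were removed in later Gradio releases in favor of top-level component classes. Below is a minimal sketch of the same interface against the current component API; voice_cloning and speakers are assumed to be defined exactly as in the main.py above, and this is not part of the deleted file.

import gradio as gr

# Modernization sketch: in Gradio 3+/4, components are top-level classes
# (gr.Dropdown, gr.Audio) rather than gr.inputs.* / gr.outputs.*.
# voice_cloning and speakers are assumed to come from main.py above.
iface = gr.Interface(
    fn=voice_cloning,
    inputs=[
        gr.Dropdown(choices=speakers, label="Speaker"),
        gr.Audio(type="filepath", label="Audio"),  # hand predict() a path so librosa.load can read it
    ],
    outputs=gr.Audio(label="Cloned Audio"),  # expects a (sample_rate, numpy_array) tuple
)

if __name__ == "__main__":
    iface.launch()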