NeoPy committed (verified)
Commit 8150bbe · 1 Parent(s): 05aac64

Upload inference.py

Files changed (1)
  1. RVC/inference.py +284 -0
RVC/inference.py ADDED
@@ -0,0 +1,284 @@
import os
import sys
import torch
import librosa
import logging
import warnings

import numpy as np
import soundfile as sf

warnings.filterwarnings("ignore")
sys.path.append(os.getcwd())

from modules import fairseq
from modules.config import Config
from modules.cut import cut, restore
from modules.pipeline import Pipeline
from modules.utils import clear_gpu_cache
from modules.synthesizers import Synthesizer
from modules.utils import check_predictors, check_embedders, load_audio

for l in ["torch", "faiss", "omegaconf", "httpx", "httpcore", "faiss.loader", "numba.core", "urllib3", "transformers", "matplotlib"]:
    logging.getLogger(l).setLevel(logging.ERROR)

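# Entry point: convert one audio file, or every supported audio file in a directory,
# using an RVC voice model (.pth) and an optional retrieval index.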
def run_inference_script(
    is_half=False,
    cpu_mode=False,
    pitch=0,
    filter_radius=3,
    index_rate=0.5,
    volume_envelope=1,
    protect=0.5,
    hop_length=64,
    f0_method="rmvpe",
    input_path=None,
    output_path="./output.wav",
    pth_path=None,
    index_path=None,
    export_format="wav",
    embedder_model="contentvec_base",
    resample_sr=0,
    f0_autotune=False,
    f0_autotune_strength=1,
    split_audio=False,
    clean_audio=False,
    clean_strength=0.7
):
    check_predictors(f0_method)
    check_embedders(embedder_model)

    if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith(".pth"):
        print("[WARNING] Please provide a valid .pth model file.")
        return

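    # Build the runtime config and load the voice model once; the same converter
    # instance is reused for every file when input_path is a directory.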
    config = Config(is_half=is_half, cpu_mode=cpu_mode)
    cvt = VoiceConverter(config, pth_path, 0)

    if os.path.isdir(input_path):
        print("[INFO] Using batch conversion...")
        audio_files = [f for f in os.listdir(input_path) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]

        if not audio_files:
            print("[WARNING] No audio files found.")
            return

        print(f"[INFO] Found {len(audio_files)} audio files for conversion.")

        for audio in audio_files:
            audio_path = os.path.join(input_path, audio)
            output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}")

            print(f"[INFO] Converting '{audio_path}'...")
            if os.path.exists(output_audio): os.remove(output_audio)

            cvt.convert_audio(
                audio_input_path=audio_path,
                audio_output_path=output_audio,
                index_path=index_path,
                embedder_model=embedder_model,
                pitch=pitch,
                f0_method=f0_method,
                index_rate=index_rate,
                volume_envelope=volume_envelope,
                protect=protect,
                hop_length=hop_length,
                filter_radius=filter_radius,
                export_format=export_format,
                resample_sr=resample_sr,
                f0_autotune=f0_autotune,
                f0_autotune_strength=f0_autotune_strength,
                split_audio=split_audio,
                clean_audio=clean_audio,
                clean_strength=clean_strength
            )

        print("[INFO] Conversion complete.")
    else:
        if not os.path.exists(input_path):
            print("[WARNING] Input audio file not found.")
            return

        print(f"[INFO] Converting '{input_path}'...")
        if os.path.exists(output_path): os.remove(output_path)

        cvt.convert_audio(
            audio_input_path=input_path,
            audio_output_path=output_path,
            index_path=index_path,
            embedder_model=embedder_model,
            pitch=pitch,
            f0_method=f0_method,
            index_rate=index_rate,
            volume_envelope=volume_envelope,
            protect=protect,
            hop_length=hop_length,
            filter_radius=filter_radius,
            export_format=export_format,
            resample_sr=resample_sr,
            f0_autotune=f0_autotune,
            f0_autotune_strength=f0_autotune_strength,
            split_audio=split_audio,
            clean_audio=clean_audio,
            clean_strength=clean_strength
        )

        print("[INFO] Conversion complete.")

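# Wraps the RVC voice checkpoint, the speech embedder and the conversion pipeline;
# one instance can convert multiple files.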
class VoiceConverter:
    def __init__(self, config, model_path, sid=0):
        self.config = config
        self.device = config.device
        self.hubert_model = None
        self.tgt_sr = None
        self.net_g = None
        self.vc = None
        self.cpt = None
        self.version = None
        self.n_spk = None
        self.use_f0 = None
        self.loaded_model = None
        self.vocoder = "Default"
        self.energy = False  # overwritten from the checkpoint in setup()
        self.sample_rate = 16000
        self.sid = sid
        self.get_vc(model_path, sid)

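    # Convert a single file: load and peak-normalize the audio, run the RVC pipeline
    # (optionally on silence-split chunks), then resample/denoise as requested and
    # write the result.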
    def convert_audio(
        self,
        audio_input_path,
        audio_output_path,
        index_path,
        embedder_model,
        pitch,
        f0_method,
        index_rate,
        volume_envelope,
        protect,
        hop_length,
        filter_radius,
        export_format,
        resample_sr=0,
        f0_autotune=False,
        f0_autotune_strength=1,
        split_audio=False,
        clean_audio=False,
        clean_strength=0.5
    ):
        try:
            audio = load_audio(audio_input_path, self.sample_rate)
            audio_max = np.abs(audio).max() / 0.95
            if audio_max > 1: audio /= audio_max

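            # Lazily load the speech embedder (ContentVec/HuBERT) on first use and
            # match its precision to the half/full-precision setting.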
            if not self.hubert_model:
                embedder_model_path = os.path.join("models", embedder_model + ".pt")
                if not os.path.exists(embedder_model_path): raise FileNotFoundError(f"[ERROR] Embedder not found: {embedder_model}")

                models = fairseq.load_model(embedder_model_path).to(self.device).eval()
                self.hubert_model = models.half() if self.config.is_half else models.float()

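            # Optionally split the input on silence so long files are processed in
            # chunks; each chunk is converted and the results are stitched back together.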
            if split_audio:
                chunks = cut(
                    audio,
                    self.sample_rate,
                    db_thresh=-60,
                    min_interval=500
                )
                print(f"[INFO] Split total: {len(chunks)}")
            else: chunks = [(audio, 0, 0)]

            converted_chunks = [
                (
                    start,
                    end,
                    self.vc.pipeline(
                        model=self.hubert_model,
                        net_g=self.net_g,
                        sid=self.sid,
                        audio=waveform,
                        f0_up_key=pitch,
                        f0_method=f0_method,
                        file_index=(
                            index_path.strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added") if index_path else ""
                        ),
                        index_rate=index_rate,
                        pitch_guidance=self.use_f0,
                        filter_radius=filter_radius,
                        volume_envelope=volume_envelope,
                        version=self.version,
                        protect=protect,
                        hop_length=hop_length,
                        energy_use=self.energy,
                        f0_autotune=f0_autotune,
                        f0_autotune_strength=f0_autotune_strength
                    )
                ) for waveform, start, end in chunks
            ]

            audio_output = restore(
                converted_chunks,
                total_len=len(audio),
                dtype=converted_chunks[0][2].dtype
            ) if split_audio else converted_chunks[0][2]

            if self.tgt_sr != resample_sr and resample_sr > 0:
                audio_output = librosa.resample(audio_output, orig_sr=self.tgt_sr, target_sr=resample_sr, res_type="soxr_vhq")
                self.tgt_sr = resample_sr

            if clean_audio:
                from modules.noisereduce import reduce_noise
                audio_output = reduce_noise(
                    y=audio_output,
                    sr=self.tgt_sr,
                    prop_decrease=clean_strength,
                    device=self.device
                )

            sf.write(audio_output_path, audio_output, self.tgt_sr, format=export_format)
        except Exception as e:
            import traceback
            print(traceback.format_exc())
            print(f"[ERROR] An error has occurred: {e}")

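    # Load or reload the checkpoint at weight_root; an empty speaker id triggers a
    # cleanup of the previously loaded model first.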
    def get_vc(self, weight_root, sid):
        if sid == "" or sid == []:
            self.cleanup()
            clear_gpu_cache()

        if not self.loaded_model or self.loaded_model != weight_root:
            self.loaded_model = weight_root
            self.load_model()
            if self.cpt is not None: self.setup()

    def cleanup(self):
        if self.hubert_model is not None:
            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
            clear_gpu_cache()

        del self.net_g, self.cpt
        clear_gpu_cache()
        self.cpt = None

    def load_model(self):
        if os.path.isfile(self.loaded_model): self.cpt = torch.load(self.loaded_model, map_location="cpu")
        else: self.cpt = None

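    # Read checkpoint metadata (target sample rate, speaker count, version, vocoder,
    # f0/energy flags), build the synthesizer, load its weights and create the pipeline.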
    def setup(self):
        if self.cpt is not None:
            self.tgt_sr = self.cpt["config"][-1]
            self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]

            self.use_f0 = self.cpt.get("f0", 1)
            self.version = self.cpt.get("version", "v1")
            self.vocoder = self.cpt.get("vocoder", "Default")
            self.energy = self.cpt.get("energy", False)

            if self.vocoder != "Default": self.config.is_half = False
            self.net_g = Synthesizer(*self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=768 if self.version == "v2" else 256, vocoder=self.vocoder, energy=self.energy)
            del self.net_g.enc_q

            self.net_g.load_state_dict(self.cpt["weight"], strict=False)
            self.net_g.eval().to(self.device)
            self.net_g = self.net_g.half() if self.config.is_half else self.net_g.float()
            self.n_spk = self.cpt["config"][-3]

            self.vc = Pipeline(self.tgt_sr, self.config)
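A minimal usage sketch, assuming the script is run from the repository root (so the modules package and the embedder weights under models/ resolve); the audio, .pth and .index paths below are placeholders:

# Hypothetical invocation; substitute real paths for the audio, voice model and index.
if __name__ == "__main__":
    run_inference_script(
        input_path="audios/source.wav",          # a single file, or a directory for batch mode
        output_path="audios/source_output.wav",
        pth_path="weights/voice.pth",
        index_path="weights/voice.index",
        pitch=0,                                 # semitone shift applied to the converted voice
        f0_method="rmvpe",
        clean_audio=True,
        clean_strength=0.7
    )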