Vijish committed on
Commit 2a0ee4f · verified · 1 Parent(s): 5f563af

Upload 2 files

Files changed (2)
  1. vc_infer_pipeline.py +451 -0
  2. voice_processing.py +248 -0
vc_infer_pipeline.py ADDED
@@ -0,0 +1,451 @@
+ import os
+ import sys
+ import traceback
+ from functools import lru_cache
+ from time import time as ttime
+
+ import faiss
+ import librosa
+ import numpy as np
+ import parselmouth
+ import pyworld
+ import torch
+ import torch.nn.functional as F
+ import torchcrepe
+ from scipy import signal
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+
+ input_audio_path2wav = {}
+
+
+ @lru_cache
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
+     audio = input_audio_path2wav[input_audio_path]
+     f0, t = pyworld.harvest(
+         audio,
+         fs=fs,
+         f0_ceil=f0max,
+         f0_floor=f0min,
+         frame_period=frame_period,
+     )
+     f0 = pyworld.stonemask(audio, f0, t, fs)
+     return f0
+
+
+ def change_rms(data1, sr1, data2, sr2, rate):  # 1 = input audio, 2 = output audio, rate = weight given to 2
+     # print(data1.max(),data2.max())
+     rms1 = librosa.feature.rms(
+         y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
+     )  # one RMS point every half second
+     rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+     rms1 = torch.from_numpy(rms1)
+     rms1 = F.interpolate(
+         rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.from_numpy(rms2)
+     rms2 = F.interpolate(
+         rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+     data2 *= (
+         torch.pow(rms1, torch.tensor(1 - rate))
+         * torch.pow(rms2, torch.tensor(rate - 1))
+     ).numpy()
+     return data2
+
+
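change_rms above rescales the converted audio by rms1^(1-rate) * rms2^(rate-1), so rate=1 leaves the output's own loudness envelope untouched while rate=0 fully imposes the input's envelope. A minimal sketch of the same blend for a single frame, using made-up RMS values rather than anything from this commit:

    rate = 0.25                    # same default later passed as rms_mix_rate in voice_processing.py
    rms_in, rms_out = 0.20, 0.05   # hypothetical input/output envelope values for one frame
    gain = rms_in ** (1 - rate) * rms_out ** (rate - 1)
    # gain ≈ 2.83: a quiet converted frame is pulled most of the way toward the louder source level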
+ class VC(object):
+     def __init__(self, tgt_sr, config):
+         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
+             config.x_pad,
+             config.x_query,
+             config.x_center,
+             config.x_max,
+             config.is_half,
+         )
+         self.sr = 16000  # HuBERT input sample rate
+         self.window = 160  # samples per frame
+         self.t_pad = self.sr * self.x_pad  # padding added before/after each chunk
+         self.t_pad_tgt = tgt_sr * self.x_pad
+         self.t_pad2 = self.t_pad * 2
+         self.t_query = self.sr * self.x_query  # search window around each candidate cut point
+         self.t_center = self.sr * self.x_center  # spacing between candidate cut points
+         self.t_max = self.sr * self.x_max  # duration threshold below which no cutting is done
+         self.device = config.device
+
+     def get_f0(
+         self,
+         input_audio_path,
+         x,
+         p_len,
+         f0_up_key,
+         f0_method,
+         filter_radius,
+         inp_f0=None,
+     ):
+         global input_audio_path2wav
+         time_step = self.window / self.sr * 1000
+         f0_min = 50
+         f0_max = 1100
+         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+         if f0_method == "pm":
+             f0 = (
+                 parselmouth.Sound(x, self.sr)
+                 .to_pitch_ac(
+                     time_step=time_step / 1000,
+                     voicing_threshold=0.6,
+                     pitch_floor=f0_min,
+                     pitch_ceiling=f0_max,
+                 )
+                 .selected_array["frequency"]
+             )
+             pad_size = (p_len - len(f0) + 1) // 2
+             if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                 f0 = np.pad(
+                     f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                 )
+         elif f0_method == "harvest":
+             input_audio_path2wav[input_audio_path] = x.astype(np.double)
+             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+             if filter_radius > 2:
+                 f0 = signal.medfilt(f0, 3)
+         elif f0_method == "crepe":
+             model = "full"
+             # Pick a batch size that doesn't cause memory errors on your GPU
+             batch_size = 512
+             # Compute pitch using the first GPU
+             audio = torch.tensor(np.copy(x))[None].float()
+             f0, pd = torchcrepe.predict(
+                 audio,
+                 self.sr,
+                 self.window,
+                 f0_min,
+                 f0_max,
+                 model,
+                 batch_size=batch_size,
+                 device=self.device,
+                 return_periodicity=True,
+             )
+             pd = torchcrepe.filter.median(pd, 3)
+             f0 = torchcrepe.filter.mean(f0, 3)
+             f0[pd < 0.1] = 0
+             f0 = f0[0].cpu().numpy()
+         elif f0_method == "rmvpe":
+             if not hasattr(self, "model_rmvpe"):
+                 from rmvpe import RMVPE
+
+                 print("loading rmvpe model")
+                 self.model_rmvpe = RMVPE(
+                     "rmvpe.pt", is_half=self.is_half, device=self.device
+                 )
+             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+         f0 *= pow(2, f0_up_key / 12)
+         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         tf0 = self.sr // self.window  # f0 points per second
+         if inp_f0 is not None:
+             delta_t = np.round(
+                 (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+             ).astype("int16")
+             replace_f0 = np.interp(
+                 list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+             )
+             shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+             f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+                 :shape
+             ]
+         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         f0bak = f0.copy()
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+             f0_mel_max - f0_mel_min
+         ) + 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > 255] = 255
+         f0_coarse = np.rint(f0_mel).astype(int)  # np.int alias was removed in NumPy >= 1.24
+         return f0_coarse, f0bak  # 1-0
+
+     def vc(
+         self,
+         model,
+         net_g,
+         sid,
+         audio0,
+         pitch,
+         pitchf,
+         times,
+         index,
+         big_npy,
+         index_rate,
+         version,
+         protect,
+     ):  # ,file_index,file_big_npy
+         feats = torch.from_numpy(audio0)
+         if self.is_half:
+             feats = feats.half()
+         else:
+             feats = feats.float()
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+
+         inputs = {
+             "source": feats.to(self.device),
+             "padding_mask": padding_mask,
+             "output_layer": 9 if version == "v1" else 12,
+         }
+         t0 = ttime()
+         with torch.no_grad():
+             logits = model.extract_features(**inputs)
+             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             feats0 = feats.clone()
+         if (
+             index is not None
+             and big_npy is not None
+             and index_rate != 0
+         ):
+             npy = feats[0].cpu().numpy()
+             if self.is_half:
+                 npy = npy.astype("float32")
+
+             # _, I = index.search(npy, 1)
+             # npy = big_npy[I.squeeze()]
+
+             score, ix = index.search(npy, k=8)
+             weight = np.square(1 / score)
+             weight /= weight.sum(axis=1, keepdims=True)
+             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+
+             if self.is_half:
+                 npy = npy.astype("float16")
+             feats = (
+                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                 + (1 - index_rate) * feats
+             )
+
+         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                 0, 2, 1
+             )
+         t1 = ttime()
+         p_len = audio0.shape[0] // self.window
+         if feats.shape[1] < p_len:
+             p_len = feats.shape[1]
+             if pitch is not None and pitchf is not None:
+                 pitch = pitch[:, :p_len]
+                 pitchf = pitchf[:, :p_len]
+
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             pitchff = pitchf.clone()
+             pitchff[pitchf > 0] = 1
+             pitchff[pitchf < 1] = protect
+             pitchff = pitchff.unsqueeze(-1)
+             feats = feats * pitchff + feats0 * (1 - pitchff)
+             feats = feats.to(feats0.dtype)
+         p_len = torch.tensor([p_len], device=self.device).long()
+         with torch.no_grad():
+             if pitch is not None and pitchf is not None:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
+                     .data.cpu()
+                     .float()
+                     .numpy()
+                 )
+             else:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
+                 )
+         del feats, p_len, padding_mask
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         t2 = ttime()
+         times[0] += t1 - t0
+         times[2] += t2 - t1
+         return audio1
+
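The index branch of vc() above is the retrieval step of RVC: each HuBERT frame is replaced by an inverse-squared-distance weighted average of its 8 nearest neighbours from the FAISS index of training features, then blended back with the original frame according to index_rate. A self-contained sketch of that lookup on toy data (the 768-dim feature size and the random arrays are illustrative assumptions, not values from this commit):

    import faiss
    import numpy as np

    dim = 768                                              # v2 HuBERT feature size; v1 uses 256
    train_feats = np.random.rand(1000, dim).astype("float32")
    index = faiss.IndexFlatL2(dim)
    index.add(train_feats)

    frames = np.random.rand(5, dim).astype("float32")      # stand-in for feats[0].cpu().numpy()
    score, ix = index.search(frames, 8)                    # squared distances and neighbour ids
    weight = np.square(1 / score)
    weight /= weight.sum(axis=1, keepdims=True)
    retrieved = np.sum(train_feats[ix] * np.expand_dims(weight, axis=2), axis=1)

    index_rate = 0.75                                      # 1.0 = fully retrieved, 0.0 = untouched
    mixed = retrieved * index_rate + (1 - index_rate) * frames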
+     def pipeline(
+         self,
+         model,
+         net_g,
+         sid,
+         audio,
+         input_audio_path,
+         times,
+         f0_up_key,
+         f0_method,
+         file_index,
+         # file_big_npy,
+         index_rate,
+         if_f0,
+         filter_radius,
+         tgt_sr,
+         resample_sr,
+         rms_mix_rate,
+         version,
+         protect,
+         f0_file=None,
+     ):
+         if (
+             file_index != ""
+             # and file_big_npy != ""
+             # and os.path.exists(file_big_npy) == True
+             and os.path.exists(file_index)
+             and index_rate != 0
+         ):
+             try:
+                 index = faiss.read_index(file_index)
+                 # big_npy = np.load(file_big_npy)
+                 big_npy = index.reconstruct_n(0, index.ntotal)
+             except:
+                 traceback.print_exc()
+                 index = big_npy = None
+         else:
+             index = big_npy = None
+         audio = signal.filtfilt(bh, ah, audio)
+         audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+         opt_ts = []
+         if audio_pad.shape[0] > self.t_max:
+             audio_sum = np.zeros_like(audio)
+             for i in range(self.window):
+                 audio_sum += audio_pad[i : i - self.window]
+             for t in range(self.t_center, audio.shape[0], self.t_center):
+                 opt_ts.append(
+                     t
+                     - self.t_query
+                     + np.where(
+                         np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                     )[0][0]
+                 )
+         s = 0
+         audio_opt = []
+         t = None
+         t1 = ttime()
+         audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+         p_len = audio_pad.shape[0] // self.window
+         inp_f0 = None
+         if hasattr(f0_file, "name"):
+             try:
+                 with open(f0_file.name, "r") as f:
+                     lines = f.read().strip("\n").split("\n")
+                 inp_f0 = []
+                 for line in lines:
+                     inp_f0.append([float(i) for i in line.split(",")])
+                 inp_f0 = np.array(inp_f0, dtype="float32")
+             except:
+                 traceback.print_exc()
+         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+         pitch, pitchf = None, None
+         if if_f0 == 1:
+             pitch, pitchf = self.get_f0(
+                 input_audio_path,
+                 audio_pad,
+                 p_len,
+                 f0_up_key,
+                 f0_method,
+                 filter_radius,
+                 inp_f0,
+             )
+             pitch = pitch[:p_len]
+             pitchf = pitchf[:p_len]
+             if self.device == "mps":
+                 pitchf = pitchf.astype(np.float32)
+             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+         t2 = ttime()
+         times[1] += t2 - t1
+         for t in opt_ts:
+             t = t // self.window * self.window
+             if if_f0 == 1:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                         pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             else:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         None,
+                         None,
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             s = t
+         if if_f0 == 1:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     pitch[:, t // self.window :] if t is not None else pitch,
+                     pitchf[:, t // self.window :] if t is not None else pitchf,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         else:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     None,
+                     None,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         audio_opt = np.concatenate(audio_opt)
+         if rms_mix_rate != 1:
+             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+         if resample_sr >= 16000 and tgt_sr != resample_sr:
+             audio_opt = librosa.resample(
+                 audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+             )
+         audio_max = np.abs(audio_opt).max() / 0.99
+         max_int16 = 32768
+         if audio_max > 1:
+             max_int16 /= audio_max
+         audio_opt = (audio_opt * max_int16).astype(np.int16)
+         del pitch, pitchf, sid
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return audio_opt
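Roughly, pipeline() high-pass filters the input, picks low-energy cut points for long audio, extracts f0 once, and feeds each chunk through vc(). A hedged sketch of how it can be driven, reusing the loader helpers defined in voice_processing.py below; the model folder name and input file are placeholders, and note that importing voice_processing also loads HuBERT and RMVPE as side effects:

    import librosa
    from voice_processing import config, load_hubert, model_data

    hubert_model = load_hubert()
    tgt_sr, net_g, vc, version, index_file, if_f0 = model_data("my_model")   # hypothetical folder under weights/

    audio, _ = librosa.load("input.wav", sr=16000, mono=True)                # placeholder input file
    times = [0, 0, 0]
    converted = vc.pipeline(
        hubert_model, net_g, 0, audio, "input.wav", times,
        0, "rmvpe", index_file, 0.75, if_f0, 3,        # f0_up_key, f0_method, file_index, index_rate, if_f0, filter_radius
        tgt_sr, 0, 0.25, version, 0.33, None,          # tgt_sr, resample_sr, rms_mix_rate, version, protect, f0_file
    )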
voice_processing.py ADDED
@@ -0,0 +1,248 @@
+ import asyncio
+ import datetime
+ import logging
+ import os
+ import time
+ import traceback
+ import tempfile
+
+ import edge_tts
+ import librosa
+ import torch
+ from fairseq import checkpoint_utils
+ import uuid
+
+ from config import Config
+ from lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from rmvpe import RMVPE
+ from vc_infer_pipeline import VC
+
+ # Set logging levels
+ logging.getLogger("fairseq").setLevel(logging.WARNING)
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ logging.getLogger("markdown_it").setLevel(logging.WARNING)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ logging.getLogger("matplotlib").setLevel(logging.WARNING)
+
+ limitation = os.getenv("SYSTEM") == "spaces"
+
+ config = Config()
+
+ # Edge TTS
+ tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
+ tts_voices = ["mn-MN-BataaNeural", "mn-MN-YesuiNeural"]  # Specific voices
+
+ # RVC models
+ model_root = "weights"
+ models = [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]
+ models.sort()
+
+ def get_unique_filename(extension):
+     return f"{uuid.uuid4()}.{extension}"
+
+
+ # edge_output_filename = get_unique_filename("mp3")
+
+
+ def model_data(model_name):
+     # global n_spk, tgt_sr, net_g, vc, cpt, version, index_file
+     pth_path = [
+         f"{model_root}/{model_name}/{f}"
+         for f in os.listdir(f"{model_root}/{model_name}")
+         if f.endswith(".pth")
+     ][0]
+     print(f"Loading {pth_path}")
+     cpt = torch.load(pth_path, map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+     if_f0 = cpt.get("f0", 1)
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     else:
+         raise ValueError("Unknown version")
+     del net_g.enc_q
+     net_g.load_state_dict(cpt["weight"], strict=False)
+     print("Model loaded")
+     net_g.eval().to(config.device)
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+     vc = VC(tgt_sr, config)
+     # n_spk = cpt["config"][-3]
+
+     index_files = [
+         f"{model_root}/{model_name}/{f}"
+         for f in os.listdir(f"{model_root}/{model_name}")
+         if f.endswith(".index")
+     ]
+     if len(index_files) == 0:
+         print("No index file found")
+         index_file = ""
+     else:
+         index_file = index_files[0]
+         print(f"Index file found: {index_file}")
+
+     return tgt_sr, net_g, vc, version, index_file, if_f0
+
+
+ def load_hubert():
+     # global hubert_model
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     return hubert_model.eval()
+
+ def get_model_names():
+     model_root = "weights"  # Assuming this is where your models are stored
+     return [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]
+
+ async def tts(
+     model_name,
+     tts_text,
+     tts_voice,
+     index_rate,
+     use_uploaded_voice,
+     uploaded_voice,
+ ):
+     # Default values for parameters used in EdgeTTS
+     speed = 0  # Default speech speed
+     f0_up_key = 0  # Default pitch adjustment
+     f0_method = "rmvpe"  # Default pitch extraction method
+     protect = 0.33  # Default protect value
+     filter_radius = 3
+     resample_sr = 0
+     rms_mix_rate = 0.25
+     edge_time = 0  # Initialize edge_time
+
+     edge_output_filename = get_unique_filename("mp3")
+
+
+     try:
+         if use_uploaded_voice:
+             if uploaded_voice is None:
+                 return "No voice file uploaded.", None, None
+
+             # Process the uploaded voice file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                 tmp_file.write(uploaded_voice)
+                 uploaded_file_path = tmp_file.name
+
+             # uploaded_file_path = uploaded_voice.name
+             audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
+         else:
+             # EdgeTTS processing
+             if limitation and len(tts_text) > 4000:
+                 return (
+                     f"Text characters should be at most 4000 in this huggingface space, but got {len(tts_text)} characters.",
+                     None,
+                     None,
+                 )
+
+             # Invoke Edge TTS
+             t0 = time.time()
+             speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
+             await edge_tts.Communicate(
+                 tts_text, tts_voice, rate=speed_str
+             ).save(edge_output_filename)
+             t1 = time.time()
+             edge_time = t1 - t0
+
+             audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
+
+         # Common processing after loading the audio
+         duration = len(audio) / sr
+         print(f"Audio duration: {duration}s")
+         if limitation and duration >= 20:
+             return (
+                 f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
+                 None,
+                 None,
+             )
+
+         f0_up_key = int(f0_up_key)
+         tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
+
+         # Setup for RMVPE or other pitch extraction methods
+         if f0_method == "rmvpe":
+             vc.model_rmvpe = rmvpe_model
+
+         # Perform voice conversion pipeline
+         times = [0, 0, 0]
+         audio_opt = vc.pipeline(
+             hubert_model,
+             net_g,
+             0,
+             audio,
+             edge_output_filename if not use_uploaded_voice else uploaded_file_path,
+             times,
+             f0_up_key,
+             f0_method,
+             index_file,
+             index_rate,
+             if_f0,
+             filter_radius,
+             tgt_sr,
+             resample_sr,
+             rms_mix_rate,
+             version,
+             protect,
+             None,
+         )
+
+         if tgt_sr != resample_sr and resample_sr >= 16000:
+             tgt_sr = resample_sr
+
+         info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
+         print(info)
+         return (
+             info,
+             edge_output_filename if not use_uploaded_voice else None,
+             (tgt_sr, audio_opt),
+             edge_output_filename,
+         )
+
+     except EOFError:
+         info = (
+             "output not valid. This may occur when input text and speaker do not match."
+         )
+         print(info)
+         return info, None, None
+     except Exception as e:
+         traceback_info = traceback.format_exc()
+         print(traceback_info)
+         return str(e), None, None
+
+
+ voice_mapping = {
+     "Mongolian Male": "mn-MN-BataaNeural",
+     "Mongolian Female": "mn-MN-YesuiNeural",
+ }
+
+
+
+ hubert_model = load_hubert()
+
+ rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
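Since tts() is a coroutine, the web UI or API layer that sits on top of this module (not part of this commit) is expected to await it, typically after mapping a display name through voice_mapping. A hedged example of a standalone call; the model folder name and sample text are placeholders:

    import asyncio
    from voice_processing import tts, voice_mapping

    result = asyncio.run(
        tts(
            model_name="my_model",                        # hypothetical folder under weights/
            tts_text="Сайн байна уу?",                    # sample Mongolian greeting
            tts_voice=voice_mapping["Mongolian Female"],
            index_rate=0.75,
            use_uploaded_voice=False,
            uploaded_voice=None,
        )
    )
    print(result[0])  # info string; on success, result[2] holds (sample_rate, converted_audio)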