Kit-Lemonfoot committed on
Commit
c8d8351
1 Parent(s): 9b54243

GPTSV_FI Update Part 1

.gitignore CHANGED
@@ -10,3 +10,8 @@ reference
  GPT_weights
  SoVITS_weights
  TEMP
+ PortableGit
+ ffmpeg.exe
+ ffprobe.exe
+ tmp_audio
+ trained
api.py CHANGED
@@ -1,559 +1,734 @@
- """
- # api.py usage
-
- ` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" `
-
- ## Command-line arguments:
-
- `-s` - `SoVITS model path, can be set in config.py`
- `-g` - `GPT model path, can be set in config.py`
-
- Used when a request does not carry its own reference audio:
- `-dr` - `default reference audio path`
- `-dt` - `default reference audio text`
- `-dl` - `default reference audio language, "中文","英文","日文","zh","en","ja"`
-
- `-d` - `inference device, "cuda","cpu"`
- `-a` - `bind address, default "127.0.0.1"`
- `-p` - `bind port, default 9880, can be set in config.py`
- `-fp` - `override config.py and use full precision`
- `-hp` - `override config.py and use half precision`
-
- `-hb` - `cnhubert path`
- `-b` - `bert path`
-
- ## Endpoints:
-
- ### Inference
-
- endpoint: `/`
-
- Using the reference audio given on the command line:
- GET:
- `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
- POST:
- ```json
- {
-     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
-     "text_language": "zh"
- }
- ```
-
- Specifying the reference audio for a single request:
- GET:
- `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
- POST:
- ```json
- {
-     "refer_wav_path": "123.wav",
-     "prompt_text": "一二三。",
-     "prompt_language": "zh",
-     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
-     "text_language": "zh"
- }
- ```
-
- RESP:
- success: returns the wav audio stream directly, http code 200
- failure: returns json with the error message, http code 400
-
-
- ### Change the default reference audio
-
- endpoint: `/change_refer`
-
- Same keys as the inference endpoint
-
- GET:
- `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
- POST:
- ```json
- {
-     "refer_wav_path": "123.wav",
-     "prompt_text": "一二三。",
-     "prompt_language": "zh"
- }
- ```
-
- RESP:
- success: json, http code 200
- failure: json, 400
-
-
- ### Command control
-
- endpoint: `/control`
-
- command:
- "restart": restart the service
- "exit": stop the service
-
- GET:
- `http://127.0.0.1:9880/control?command=restart`
- POST:
- ```json
- {
-     "command": "restart"
- }
- ```
-
- RESP:
-
- """
-
-
- import argparse
- import os
- import sys
-
- now_dir = os.getcwd()
- sys.path.append(now_dir)
- sys.path.append("%s/GPT_SoVITS" % (now_dir))
-
- import signal
- from time import time as ttime
- import torch
- import librosa
- import soundfile as sf
- from fastapi import FastAPI, Request, HTTPException
- from fastapi.responses import StreamingResponse, JSONResponse
- import uvicorn
- from transformers import AutoModelForMaskedLM, AutoTokenizer
- import numpy as np
- from feature_extractor import cnhubert
- from io import BytesIO
- from module.models import SynthesizerTrn
- from AR.models.t2s_lightning_module import Text2SemanticLightningModule
- from text import cleaned_text_to_sequence
- from text.cleaner import clean_text
- from module.mel_processing import spectrogram_torch
- from my_utils import load_audio
- import config as global_config
-
- g_config = global_config.Config()
-
- # AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu"
-
- parser = argparse.ArgumentParser(description="GPT-SoVITS api")
-
- parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS model path")
- parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT model path")
-
- parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="default reference audio path")
- parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="default reference audio text")
- parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="default reference audio language")
-
- parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
- parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
- parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
- parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="override config.is_half to False, use full precision")
- parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="override config.is_half to True, use half precision")
- # the boolean flags are used as `python ./api.py -fp ...`
- # which gives full_precision==True, half_precision==False
-
- parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="override config.cnhubert_path")
- parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="override config.bert_path")
-
- args = parser.parse_args()
-
- sovits_path = args.sovits_path
- gpt_path = args.gpt_path
-
-
- class DefaultRefer:
-     def __init__(self, path, text, language):
-         self.path = path
-         self.text = text
-         self.language = language
-
-     def is_ready(self) -> bool:
-         return is_full(self.path, self.text, self.language)
-
-
- default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
-
- device = args.device
- port = args.port
- host = args.bind_addr
-
- if sovits_path == "":
-     sovits_path = g_config.pretrained_sovits_path
-     print(f"[WARN] No SoVITS model path given, falling back to: {sovits_path}")
- if gpt_path == "":
-     gpt_path = g_config.pretrained_gpt_path
-     print(f"[WARN] No GPT model path given, falling back to: {gpt_path}")
-
- # default reference audio, used when the caller provides none / an incomplete set of reference-audio parameters
- if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":
-     default_refer.path, default_refer.text, default_refer.language = "", "", ""
-     print("[INFO] No default reference audio specified")
- else:
-     print(f"[INFO] Default reference audio path: {default_refer.path}")
-     print(f"[INFO] Default reference audio text: {default_refer.text}")
-     print(f"[INFO] Default reference audio language: {default_refer.language}")
-
- is_half = g_config.is_half
- if args.full_precision:
-     is_half = False
- if args.half_precision:
-     is_half = True
- if args.full_precision and args.half_precision:
-     is_half = g_config.is_half  # contradictory flags: fall back to the config value
-
- print(f"[INFO] Half precision: {is_half}")
-
- cnhubert_base_path = args.hubert_path
- bert_path = args.bert_path
-
- cnhubert.cnhubert_base_path = cnhubert_base_path
- tokenizer = AutoTokenizer.from_pretrained(bert_path)
- bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
- if is_half:
-     bert_model = bert_model.half().to(device)
- else:
-     bert_model = bert_model.to(device)
-
-
- def is_empty(*items):  # returns False if any item is non-empty
-     for item in items:
-         if item is not None and item != "":
-             return False
-     return True
-
-
- def is_full(*items):  # returns False if any item is empty
-     for item in items:
-         if item is None or item == "":
-             return False
-     return True
-
- def change_sovits_weights(sovits_path):
-     global vq_model, hps
-     dict_s2 = torch.load(sovits_path, map_location="cpu")
-     hps = dict_s2["config"]
-     hps = DictToAttrRecursive(hps)
-     hps.model.semantic_frame_rate = "25hz"
-     vq_model = SynthesizerTrn(
-         hps.data.filter_length // 2 + 1,
-         hps.train.segment_size // hps.data.hop_length,
-         n_speakers=hps.data.n_speakers,
-         **hps.model
-     )
-     if ("pretrained" not in sovits_path):
-         del vq_model.enc_q
-     if is_half == True:
-         vq_model = vq_model.half().to(device)
-     else:
-         vq_model = vq_model.to(device)
-     vq_model.eval()
-     print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
-     with open("./sweight.txt", "w", encoding="utf-8") as f:
-         f.write(sovits_path)
- def change_gpt_weights(gpt_path):
-     global hz, max_sec, t2s_model, config
-     hz = 50
-     dict_s1 = torch.load(gpt_path, map_location="cpu")
-     config = dict_s1["config"]
-     max_sec = config["data"]["max_sec"]
-     t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
-     t2s_model.load_state_dict(dict_s1["weight"])
-     if is_half == True:
-         t2s_model = t2s_model.half()
-     t2s_model = t2s_model.to(device)
-     t2s_model.eval()
-     total = sum([param.nelement() for param in t2s_model.parameters()])
-     print("Number of parameters: %.2fM" % (total / 1e6))
-     with open("./gweight.txt", "w", encoding="utf-8") as f: f.write(gpt_path)
-
-
- def get_bert_feature(text, word2ph):
-     with torch.no_grad():
-         inputs = tokenizer(text, return_tensors="pt")
-         for i in inputs:
-             inputs[i] = inputs[i].to(device)  # inputs are long tensors, so precision simply follows bert_model
-         res = bert_model(**inputs, output_hidden_states=True)
-         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
-     assert len(word2ph) == len(text)
-     phone_level_feature = []
-     for i in range(len(word2ph)):
-         repeat_feature = res[i].repeat(word2ph[i], 1)
-         phone_level_feature.append(repeat_feature)
-     phone_level_feature = torch.cat(phone_level_feature, dim=0)
-     # if(is_half==True):phone_level_feature=phone_level_feature.half()
-     return phone_level_feature.T
-
-
- n_semantic = 1024
- dict_s2 = torch.load(sovits_path, map_location="cpu")
- hps = dict_s2["config"]
-
-
- class DictToAttrRecursive:
-     def __init__(self, input_dict):
-         for key, value in input_dict.items():
-             if isinstance(value, dict):
-                 # recurse when the value is itself a dict
-                 setattr(self, key, DictToAttrRecursive(value))
-             else:
-                 setattr(self, key, value)
-
-
- hps = DictToAttrRecursive(hps)
- hps.model.semantic_frame_rate = "25hz"
- dict_s1 = torch.load(gpt_path, map_location="cpu")
- config = dict_s1["config"]
- ssl_model = cnhubert.get_model()
- if is_half:
-     ssl_model = ssl_model.half().to(device)
- else:
-     ssl_model = ssl_model.to(device)
-
- vq_model = SynthesizerTrn(
-     hps.data.filter_length // 2 + 1,
-     hps.train.segment_size // hps.data.hop_length,
-     n_speakers=hps.data.n_speakers,
-     **hps.model)
- if is_half:
-     vq_model = vq_model.half().to(device)
- else:
-     vq_model = vq_model.to(device)
- vq_model.eval()
- print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
- hz = 50
- max_sec = config['data']['max_sec']
- t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
- t2s_model.load_state_dict(dict_s1["weight"])
- if is_half:
-     t2s_model = t2s_model.half()
- t2s_model = t2s_model.to(device)
- t2s_model.eval()
- total = sum([param.nelement() for param in t2s_model.parameters()])
- print("Number of parameters: %.2fM" % (total / 1e6))
-
-
- def get_spepc(hps, filename):
-     audio = load_audio(filename, int(hps.data.sampling_rate))
-     audio = torch.FloatTensor(audio)
-     audio_norm = audio
-     audio_norm = audio_norm.unsqueeze(0)
-     spec = spectrogram_torch(audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length,
-                              hps.data.win_length, center=False)
-     return spec
-
-
- dict_language = {
-     "中文": "zh",
-     "英文": "en",
-     "日文": "ja",
-     "ZH": "zh",
-     "EN": "en",
-     "JA": "ja",
-     "zh": "zh",
-     "en": "en",
-     "ja": "ja"
- }
-
-
- def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
-     t0 = ttime()
-     prompt_text = prompt_text.strip("\n")
-     prompt_language, text = prompt_language, text.strip("\n")
-     zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
-     with torch.no_grad():
-         wav16k, sr = librosa.load(ref_wav_path, sr=16000)
-         wav16k = torch.from_numpy(wav16k)
-         zero_wav_torch = torch.from_numpy(zero_wav)
-         if (is_half == True):
-             wav16k = wav16k.half().to(device)
-             zero_wav_torch = zero_wav_torch.half().to(device)
-         else:
-             wav16k = wav16k.to(device)
-             zero_wav_torch = zero_wav_torch.to(device)
-         wav16k = torch.cat([wav16k, zero_wav_torch])
-         ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)  # .float()
-         codes = vq_model.extract_latent(ssl_content)
-         prompt_semantic = codes[0, 0]
-     t1 = ttime()
-     prompt_language = dict_language[prompt_language]
-     text_language = dict_language[text_language]
-     phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
-     phones1 = cleaned_text_to_sequence(phones1)
-     texts = text.split("\n")
-     audio_opt = []
-
-     for text in texts:
-         phones2, word2ph2, norm_text2 = clean_text(text, text_language)
-         phones2 = cleaned_text_to_sequence(phones2)
-         if (prompt_language == "zh"):
-             bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
-         else:
-             bert1 = torch.zeros((1024, len(phones1)), dtype=torch.float16 if is_half == True else torch.float32).to(
-                 device)
-         if (text_language == "zh"):
-             bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
-         else:
-             bert2 = torch.zeros((1024, len(phones2))).to(bert1)
-         bert = torch.cat([bert1, bert2], 1)
-
-         all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
-         bert = bert.to(device).unsqueeze(0)
-         all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
-         prompt = prompt_semantic.unsqueeze(0).to(device)
-         t2 = ttime()
-         with torch.no_grad():
-             # pred_semantic = t2s_model.model.infer(
-             pred_semantic, idx = t2s_model.model.infer_panel(
-                 all_phoneme_ids,
-                 all_phoneme_len,
-                 prompt,
-                 bert,
-                 # prompt_phone_len=ph_offset,
-                 top_k=config['inference']['top_k'],
-                 early_stop_num=hz * max_sec)
-         t3 = ttime()
-         # print(pred_semantic.shape,idx)
-         pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)  # .unsqueeze(0)  # mq needs one extra unsqueeze
-         refer = get_spepc(hps, ref_wav_path)  # .to(device)
-         if (is_half == True):
-             refer = refer.half().to(device)
-         else:
-             refer = refer.to(device)
-         # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
-         audio = \
-             vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
-                             refer).detach().cpu().numpy()[
-                 0, 0]  # try reconstructing without the prompt part
-         audio_opt.append(audio)
-         audio_opt.append(zero_wav)
-         t4 = ttime()
-     print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
-     yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
-
-
- def handle_control(command):
-     if command == "restart":
-         os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
-     elif command == "exit":
-         os.kill(os.getpid(), signal.SIGTERM)
-         exit(0)
-
-
- def handle_change(path, text, language):
-     if is_empty(path, text, language):
-         return JSONResponse({"code": 400, "message": 'Missing one of the following parameters: "path", "text", "language"'}, status_code=400)
-
-     if path is not None and path != "":  # update only the fields that were provided
-         default_refer.path = path
-     if text is not None and text != "":
-         default_refer.text = text
-     if language is not None and language != "":
-         default_refer.language = language
-
-     print(f"[INFO] Current default reference audio path: {default_refer.path}")
-     print(f"[INFO] Current default reference audio text: {default_refer.text}")
-     print(f"[INFO] Current default reference audio language: {default_refer.language}")
-     print(f"[INFO] is_ready: {default_refer.is_ready()}")
-
-     return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
-
-
- def handle(refer_wav_path, prompt_text, prompt_language, text, text_language):
-     if (
-             refer_wav_path == "" or refer_wav_path is None
-             or prompt_text == "" or prompt_text is None
-             or prompt_language == "" or prompt_language is None
-     ):
-         refer_wav_path, prompt_text, prompt_language = (
-             default_refer.path,
-             default_refer.text,
-             default_refer.language,
-         )
-         if not default_refer.is_ready():
-             return JSONResponse({"code": 400, "message": "No reference audio given and no default preset"}, status_code=400)
-
-     with torch.no_grad():
-         gen = get_tts_wav(
-             refer_wav_path, prompt_text, prompt_language, text, text_language
-         )
-         sampling_rate, audio_data = next(gen)
-
-     wav = BytesIO()
-     sf.write(wav, audio_data, sampling_rate, format="wav")
-     wav.seek(0)
-
-     torch.cuda.empty_cache()
-     return StreamingResponse(wav, media_type="audio/wav")
-
-
- app = FastAPI()
-
- # added by clark ----- 2024-02-21
- # the model can be swapped after startup, so one api instance can serve requests for different speakers
- @app.post("/set_model")
- async def set_model(request: Request):
-     json_post_raw = await request.json()
-     global gpt_path
-     gpt_path = json_post_raw.get("gpt_model_path")
-     global sovits_path
-     sovits_path = json_post_raw.get("sovits_model_path")
-     print("gptpath" + gpt_path + ";vitspath" + sovits_path)
-     change_sovits_weights(sovits_path)
-     change_gpt_weights(gpt_path)
-     return "ok"
- # end of the addition ------
-
- @app.post("/control")
- async def control(request: Request):
-     json_post_raw = await request.json()
-     return handle_control(json_post_raw.get("command"))
-
-
- @app.get("/control")
- async def control(command: str = None):
-     return handle_control(command)
-
-
- @app.post("/change_refer")
- async def change_refer(request: Request):
-     json_post_raw = await request.json()
-     return handle_change(
-         json_post_raw.get("refer_wav_path"),
-         json_post_raw.get("prompt_text"),
-         json_post_raw.get("prompt_language")
-     )
-
-
- @app.get("/change_refer")
- async def change_refer(
-         refer_wav_path: str = None,
-         prompt_text: str = None,
-         prompt_language: str = None
- ):
-     return handle_change(refer_wav_path, prompt_text, prompt_language)
-
-
- @app.post("/")
- async def tts_endpoint(request: Request):
-     json_post_raw = await request.json()
-     return handle(
-         json_post_raw.get("refer_wav_path"),
-         json_post_raw.get("prompt_text"),
-         json_post_raw.get("prompt_language"),
-         json_post_raw.get("text"),
-         json_post_raw.get("text_language"),
-     )
-
-
- @app.get("/")
- async def tts_endpoint(
-         refer_wav_path: str = None,
-         prompt_text: str = None,
-         prompt_language: str = None,
-         text: str = None,
-         text_language: str = None,
- ):
-     return handle(refer_wav_path, prompt_text, prompt_language, text, text_language)
-
-
- if __name__ == "__main__":
-     uvicorn.run(app, host=host, port=port, workers=1)
+ """
+ # api.py usage
+
+ ` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" `
+
+ ## Command-line arguments:
+
+ `-s` - `SoVITS model path, can be set in config.py`
+ `-g` - `GPT model path, can be set in config.py`
+
+ Used when a request does not carry its own reference audio:
+ `-dr` - `default reference audio path`
+ `-dt` - `default reference audio text`
+ `-dl` - `default reference audio language, "中文","英文","日文","zh","en","ja"`
+
+ `-d` - `inference device, "cuda","cpu"`
+ `-a` - `bind address, default "127.0.0.1"`
+ `-p` - `bind port, default 9880, can be set in config.py`
+ `-fp` - `override config.py and use full precision`
+ `-hp` - `override config.py and use half precision`
+ `-sm` - `streaming response mode, off by default, "close","c", "normal","n", "keepalive","k"`
+ `-mt` - `audio encoding of the response, ogg by default when streaming, wav otherwise, "wav", "ogg", "aac"`
+ `-cp` - `text split punctuation, empty by default, passed as a string such as ",.,。"`
+
+ `-hb` - `cnhubert path`
+ `-b` - `bert path`
+
+ ## Endpoints:
+
+ ### Inference
+
+ endpoint: `/`
+
+ Using the reference audio given on the command line:
+ GET:
+ `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+ POST:
+ ```json
+ {
+     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+     "text_language": "zh"
+ }
+ ```
+
+ Using the command-line reference audio and setting the split punctuation:
+ GET:
+ `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh&cut_punc=,。`
+ POST:
+ ```json
+ {
+     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+     "text_language": "zh",
+     "cut_punc": ",。"
+ }
+ ```
+
+ Specifying the reference audio for a single request:
+ GET:
+ `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+ POST:
+ ```json
+ {
+     "refer_wav_path": "123.wav",
+     "prompt_text": "一二三。",
+     "prompt_language": "zh",
+     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+     "text_language": "zh"
+ }
+ ```
+
+ RESP:
+ success: returns the wav audio stream directly, http code 200
+ failure: returns json with the error message, http code 400
+
+
+ ### Change the default reference audio
+
+ endpoint: `/change_refer`
+
+ Same keys as the inference endpoint
+
+ GET:
+ `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
+ POST:
+ ```json
+ {
+     "refer_wav_path": "123.wav",
+     "prompt_text": "一二三。",
+     "prompt_language": "zh"
+ }
+ ```
+
+ RESP:
+ success: json, http code 200
+ failure: json, 400
+
+
+ ### Command control
+
+ endpoint: `/control`
+
+ command:
+ "restart": restart the service
+ "exit": stop the service
+
+ GET:
+ `http://127.0.0.1:9880/control?command=restart`
+ POST:
+ ```json
+ {
+     "command": "restart"
+ }
+ ```
+
+ RESP:
+
+ """
+
+
+ import argparse
+ import os, re
+ import sys
+ import signal
+ import LangSegment
+ from time import time as ttime
+ import torch
+ import librosa
+ import soundfile as sf
+ from fastapi import FastAPI, Request, HTTPException
+ from fastapi.responses import StreamingResponse, JSONResponse
+ import uvicorn
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+ import numpy as np
+ from feature_extractor import cnhubert
+ from io import BytesIO
+ from module.models import SynthesizerTrn
+ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
+ from text import cleaned_text_to_sequence
+ from text.cleaner import clean_text
+ from module.mel_processing import spectrogram_torch
+ from my_utils import load_audio
+ import config as global_config
+ import logging.config  # dictConfig below needs logging.config imported explicitly
+ import subprocess
+
+
+ class DefaultRefer:
+     def __init__(self, path, text, language):
+         self.path = path
+         self.text = text
+         self.language = language
+
+     def is_ready(self) -> bool:
+         return is_full(self.path, self.text, self.language)
+
+
+ def is_empty(*items):  # returns False if any item is non-empty
+     for item in items:
+         if item is not None and item != "":
+             return False
+     return True
+
+
+ def is_full(*items):  # returns False if any item is empty
+     for item in items:
+         if item is None or item == "":
+             return False
+     return True
+
+
+ def change_sovits_weights(sovits_path):
+     global vq_model, hps
+     dict_s2 = torch.load(sovits_path, map_location="cpu")
+     hps = dict_s2["config"]
+     hps = DictToAttrRecursive(hps)
+     hps.model.semantic_frame_rate = "25hz"
+     vq_model = SynthesizerTrn(
+         hps.data.filter_length // 2 + 1,
+         hps.train.segment_size // hps.data.hop_length,
+         n_speakers=hps.data.n_speakers,
+         **hps.model
+     )
+     if ("pretrained" not in sovits_path):
+         del vq_model.enc_q
+     if is_half == True:
+         vq_model = vq_model.half().to(device)
+     else:
+         vq_model = vq_model.to(device)
+     vq_model.eval()
+     vq_model.load_state_dict(dict_s2["weight"], strict=False)
+
+
+ def change_gpt_weights(gpt_path):
+     global hz, max_sec, t2s_model, config
+     hz = 50
+     dict_s1 = torch.load(gpt_path, map_location="cpu")
+     config = dict_s1["config"]
+     max_sec = config["data"]["max_sec"]
+     t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
+     t2s_model.load_state_dict(dict_s1["weight"])
+     if is_half == True:
+         t2s_model = t2s_model.half()
+     t2s_model = t2s_model.to(device)
+     t2s_model.eval()
+     total = sum([param.nelement() for param in t2s_model.parameters()])
+     logger.info("Number of parameters: %.2fM" % (total / 1e6))
+
+
+ def get_bert_feature(text, word2ph):
+     with torch.no_grad():
+         inputs = tokenizer(text, return_tensors="pt")
+         for i in inputs:
+             inputs[i] = inputs[i].to(device)  # inputs are long tensors, so precision simply follows bert_model
+         res = bert_model(**inputs, output_hidden_states=True)
+         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
+     assert len(word2ph) == len(text)
+     phone_level_feature = []
+     for i in range(len(word2ph)):
+         repeat_feature = res[i].repeat(word2ph[i], 1)
+         phone_level_feature.append(repeat_feature)
+     phone_level_feature = torch.cat(phone_level_feature, dim=0)
+     # if(is_half==True):phone_level_feature=phone_level_feature.half()
+     return phone_level_feature.T
+
+
+ def clean_text_inf(text, language):
+     phones, word2ph, norm_text = clean_text(text, language)
+     phones = cleaned_text_to_sequence(phones)
+     return phones, word2ph, norm_text
+
+
+ def get_bert_inf(phones, word2ph, norm_text, language):
+     language = language.replace("all_", "")
+     if language == "zh":
+         bert = get_bert_feature(norm_text, word2ph).to(device)  # .to(dtype)
+     else:
+         bert = torch.zeros(
+             (1024, len(phones)),
+             dtype=torch.float16 if is_half == True else torch.float32,
+         ).to(device)
+
+     return bert
+
+
+ def get_phones_and_bert(text, language):
+     if language in {"en", "all_zh", "all_ja"}:
+         language = language.replace("all_", "")
+         if language == "en":
+             LangSegment.setfilters(["en"])
+             formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
+         else:
+             # Chinese and Japanese kanji cannot be told apart, so trust the user's choice of language
+             formattext = text
+         while "  " in formattext:
+             formattext = formattext.replace("  ", " ")
+         phones, word2ph, norm_text = clean_text_inf(formattext, language)
+         if language == "zh":
+             bert = get_bert_feature(norm_text, word2ph).to(device)
+         else:
+             bert = torch.zeros(
+                 (1024, len(phones)),
+                 dtype=torch.float16 if is_half == True else torch.float32,
+             ).to(device)
+     elif language in {"zh", "ja", "auto"}:
+         textlist = []
+         langlist = []
+         LangSegment.setfilters(["zh", "ja", "en", "ko"])
+         if language == "auto":
+             for tmp in LangSegment.getTexts(text):
+                 if tmp["lang"] == "ko":
+                     langlist.append("zh")
+                     textlist.append(tmp["text"])
+                 else:
+                     langlist.append(tmp["lang"])
+                     textlist.append(tmp["text"])
+         else:
+             for tmp in LangSegment.getTexts(text):
+                 if tmp["lang"] == "en":
+                     langlist.append(tmp["lang"])
+                 else:
+                     # Chinese and Japanese kanji cannot be told apart, so trust the user's choice of language
+                     langlist.append(language)
+                 textlist.append(tmp["text"])
+         # logger.info(textlist)
+         # logger.info(langlist)
+         phones_list = []
+         bert_list = []
+         norm_text_list = []
+         for i in range(len(textlist)):
+             lang = langlist[i]
+             phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
+             bert = get_bert_inf(phones, word2ph, norm_text, lang)
+             phones_list.append(phones)
+             norm_text_list.append(norm_text)
+             bert_list.append(bert)
+         bert = torch.cat(bert_list, dim=1)
+         phones = sum(phones_list, [])
+         norm_text = ''.join(norm_text_list)
+
+     return phones, bert.to(torch.float16 if is_half == True else torch.float32), norm_text
+
+
+ class DictToAttrRecursive:
+     def __init__(self, input_dict):
+         for key, value in input_dict.items():
+             if isinstance(value, dict):
+                 # recurse when the value is itself a dict
+                 setattr(self, key, DictToAttrRecursive(value))
+             else:
+                 setattr(self, key, value)
+
+
+ def get_spepc(hps, filename):
+     audio = load_audio(filename, int(hps.data.sampling_rate))
+     audio = torch.FloatTensor(audio)
+     audio_norm = audio
+     audio_norm = audio_norm.unsqueeze(0)
+     spec = spectrogram_torch(audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length,
+                              hps.data.win_length, center=False)
+     return spec
+
+
+ def pack_audio(audio_bytes, data, rate):
+     if media_type == "ogg":
+         audio_bytes = pack_ogg(audio_bytes, data, rate)
+     elif media_type == "aac":
+         audio_bytes = pack_aac(audio_bytes, data, rate)
+     else:
+         # wav cannot be streamed, so buffer raw PCM for now
+         audio_bytes = pack_raw(audio_bytes, data, rate)
+
+     return audio_bytes
+
+
+ def pack_ogg(audio_bytes, data, rate):
+     with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
+         audio_file.write(data)
+
+     return audio_bytes
+
+
+ def pack_raw(audio_bytes, data, rate):
+     audio_bytes.write(data.tobytes())
+
+     return audio_bytes
+
+
+ def pack_wav(audio_bytes, rate):
+     data = np.frombuffer(audio_bytes.getvalue(), dtype=np.int16)
+     wav_bytes = BytesIO()
+     sf.write(wav_bytes, data, rate, format='wav')
+
+     return wav_bytes
+
+
+ def pack_aac(audio_bytes, data, rate):
+     process = subprocess.Popen([
+         'ffmpeg',
+         '-f', 's16le',  # input: 16-bit signed little-endian PCM
+         '-ar', str(rate),  # sample rate
+         '-ac', '1',  # mono
+         '-i', 'pipe:0',  # read input from the pipe
+         '-c:a', 'aac',  # encode audio as AAC
+         '-b:a', '192k',  # bitrate
+         '-vn',  # no video
+         '-f', 'adts',  # output an ADTS AAC stream
+         'pipe:1'  # write output to the pipe
+     ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     out, _ = process.communicate(input=data.tobytes())
+     audio_bytes.write(out)
+
+     return audio_bytes
+
+
+ def read_clean_buffer(audio_bytes):
+     audio_chunk = audio_bytes.getvalue()
+     audio_bytes.truncate(0)
+     audio_bytes.seek(0)
+
+     return audio_bytes, audio_chunk
+
+
+ def cut_text(text, punc):
+     punc_list = [p for p in punc if p in {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ";", ":", "…"}]
+     if len(punc_list) > 0:
+         punds = r"[" + "".join(punc_list) + r"]"
+         text = text.strip("\n")
+         items = re.split(f"({punds})", text)
+         mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
+         # keep the text intact when there is no punctuation, or none at the very end
+         if len(items) % 2 == 1:
+             mergeitems.append(items[-1])
+         text = "\n".join(mergeitems)
+
+     while "\n\n" in text:
+         text = text.replace("\n\n", "\n")
+
+     return text
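As a quick illustration of the splitting rule above (assuming the module is loaded), `cut_text` breaks the text at each punctuation mark in the given set and keeps an unterminated tail:

```python
print(cut_text("一二,三四。五六", ",。"))
# 一二,
# 三四。
# 五六
```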
+
+
+ def only_punc(text):
+     return not any(t.isalnum() or t.isalpha() for t in text)
+
+
+ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
+     t0 = ttime()
+     prompt_text = prompt_text.strip("\n")
+     prompt_language, text = prompt_language, text.strip("\n")
+     zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
+     with torch.no_grad():
+         wav16k, sr = librosa.load(ref_wav_path, sr=16000)
+         wav16k = torch.from_numpy(wav16k)
+         zero_wav_torch = torch.from_numpy(zero_wav)
+         if (is_half == True):
+             wav16k = wav16k.half().to(device)
+             zero_wav_torch = zero_wav_torch.half().to(device)
+         else:
+             wav16k = wav16k.to(device)
+             zero_wav_torch = zero_wav_torch.to(device)
+         wav16k = torch.cat([wav16k, zero_wav_torch])
+         ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)  # .float()
+         codes = vq_model.extract_latent(ssl_content)
+         prompt_semantic = codes[0, 0]
+     t1 = ttime()
+     prompt_language = dict_language[prompt_language.lower()]
+     text_language = dict_language[text_language.lower()]
+     phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language)
+     texts = text.split("\n")
+     audio_bytes = BytesIO()
+
+     for text in texts:
+         # crude guard: skip punctuation-only segments so they cannot leak the reference audio
+         if only_punc(text):
+             continue
+
+         audio_opt = []
+         phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language)
+         bert = torch.cat([bert1, bert2], 1)
+
+         all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
+         bert = bert.to(device).unsqueeze(0)
+         all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
+         prompt = prompt_semantic.unsqueeze(0).to(device)
+         t2 = ttime()
+         with torch.no_grad():
+             # pred_semantic = t2s_model.model.infer(
+             pred_semantic, idx = t2s_model.model.infer_panel(
+                 all_phoneme_ids,
+                 all_phoneme_len,
+                 prompt,
+                 bert,
+                 # prompt_phone_len=ph_offset,
+                 top_k=config['inference']['top_k'],
+                 early_stop_num=hz * max_sec)
+         t3 = ttime()
+         # print(pred_semantic.shape,idx)
+         pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)  # .unsqueeze(0)  # mq needs one extra unsqueeze
+         refer = get_spepc(hps, ref_wav_path)  # .to(device)
+         if (is_half == True):
+             refer = refer.half().to(device)
+         else:
+             refer = refer.to(device)
+         # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
+         audio = \
+             vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
+                             refer).detach().cpu().numpy()[
+                 0, 0]  # try reconstructing without the prompt part
+         audio_opt.append(audio)
+         audio_opt.append(zero_wav)
+         t4 = ttime()
+         audio_bytes = pack_audio(audio_bytes, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16), hps.data.sampling_rate)
+         # logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
+         if stream_mode == "normal":
+             audio_bytes, audio_chunk = read_clean_buffer(audio_bytes)
+             yield audio_chunk
+
+     if stream_mode != "normal":
+         if media_type == "wav":
+             audio_bytes = pack_wav(audio_bytes, hps.data.sampling_rate)
+         yield audio_bytes.getvalue()
+
+
+ def handle_control(command):
+     if command == "restart":
+         os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
+     elif command == "exit":
+         os.kill(os.getpid(), signal.SIGTERM)
+         exit(0)
+
+
+ def handle_change(path, text, language):
+     if is_empty(path, text, language):
+         return JSONResponse({"code": 400, "message": 'Missing one of the following parameters: "path", "text", "language"'}, status_code=400)
+
+     if path is not None and path != "":  # update only the fields that were provided
+         default_refer.path = path
+     if text is not None and text != "":
+         default_refer.text = text
+     if language is not None and language != "":
+         default_refer.language = language
+
+     logger.info(f"Current default reference audio path: {default_refer.path}")
+     logger.info(f"Current default reference audio text: {default_refer.text}")
+     logger.info(f"Current default reference audio language: {default_refer.language}")
+     logger.info(f"is_ready: {default_refer.is_ready()}")
+
+     return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
+
+
+ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc):
+     if (
+             refer_wav_path == "" or refer_wav_path is None
+             or prompt_text == "" or prompt_text is None
+             or prompt_language == "" or prompt_language is None
+     ):
+         refer_wav_path, prompt_text, prompt_language = (
+             default_refer.path,
+             default_refer.text,
+             default_refer.language,
+         )
+         if not default_refer.is_ready():
+             return JSONResponse({"code": 400, "message": "No reference audio given and no default preset"}, status_code=400)
+
+     if cut_punc is None:
+         text = cut_text(text, default_cut_punc)
+     else:
+         text = cut_text(text, cut_punc)
+
+     return StreamingResponse(get_tts_wav(refer_wav_path, prompt_text, prompt_language, text, text_language), media_type="audio/" + media_type)
+
+
+ # --------------------------------
+ # Initialization
+ # --------------------------------
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ sys.path.append("%s/GPT_SoVITS" % (now_dir))
+
+ dict_language = {
+     "中文": "all_zh",
+     "英文": "en",
+     "日文": "all_ja",
+     "中英混合": "zh",
+     "日英混合": "ja",
+     "多语种混合": "auto",  # "auto" splits the text and detects the language of each segment
+     "all_zh": "all_zh",
+     "en": "en",
+     "all_ja": "all_ja",
+     "zh": "zh",
+     "ja": "ja",
+     "auto": "auto",
+ }
+
+ # logger
+ logging.config.dictConfig(uvicorn.config.LOGGING_CONFIG)
+ logger = logging.getLogger('uvicorn')
+
+ # load the config
+ g_config = global_config.Config()
+
+ # parse arguments
+ parser = argparse.ArgumentParser(description="GPT-SoVITS api")
+
+ parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS model path")
+ parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT model path")
+ parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="default reference audio path")
+ parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="default reference audio text")
+ parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="default reference audio language")
+ parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
+ parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
+ parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
+ parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="override config.is_half to False, use full precision")
+ parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="override config.is_half to True, use half precision")
+ # the boolean flags are used as `python ./api.py -fp ...`
+ # which gives full_precision==True, half_precision==False
+ parser.add_argument("-sm", "--stream_mode", type=str, default="close", help="streaming response mode, close / normal / keepalive")
+ parser.add_argument("-mt", "--media_type", type=str, default="wav", help="audio encoding format, wav / ogg / aac")
+ parser.add_argument("-cp", "--cut_punc", type=str, default="", help="text split punctuation, chosen from ,.;?!、,。?!;:…")
+ # e.g. split on the common sentence-ending marks with `python ./api.py -cp ".?!。?!"`
+ parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="override config.cnhubert_path")
+ parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="override config.bert_path")
+
+ args = parser.parse_args()
+ sovits_path = args.sovits_path
+ gpt_path = args.gpt_path
+ device = args.device
+ port = args.port
+ host = args.bind_addr
+ cnhubert_base_path = args.hubert_path
+ bert_path = args.bert_path
+ default_cut_punc = args.cut_punc
+
+ # apply the argument configuration
+ default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
+
+ # check the model paths
+ if sovits_path == "":
+     sovits_path = g_config.pretrained_sovits_path
+     logger.warning(f"No SoVITS model path given, falling back to: {sovits_path}")
+ if gpt_path == "":
+     gpt_path = g_config.pretrained_gpt_path
+     logger.warning(f"No GPT model path given, falling back to: {gpt_path}")
+
+ # default reference audio, used when the caller provides none / an incomplete set of reference-audio parameters
+ if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":
+     default_refer.path, default_refer.text, default_refer.language = "", "", ""
+     logger.info("No default reference audio specified")
+ else:
+     logger.info(f"Default reference audio path: {default_refer.path}")
+     logger.info(f"Default reference audio text: {default_refer.text}")
+     logger.info(f"Default reference audio language: {default_refer.language}")
+
+ # resolve half precision
+ is_half = g_config.is_half
+ if args.full_precision:
+     is_half = False
+ if args.half_precision:
+     is_half = True
+ if args.full_precision and args.half_precision:
+     is_half = g_config.is_half  # contradictory flags: fall back to the config value
+ logger.info(f"Half precision: {is_half}")
+
+ # streaming response mode
+ if args.stream_mode.lower() in ["normal", "n"]:
+     stream_mode = "normal"
+     logger.info("Streaming responses enabled")
+ else:
+     stream_mode = "close"
+
+ # audio encoding format
+ if args.media_type.lower() in ["aac", "ogg"]:
+     media_type = args.media_type.lower()
+ elif stream_mode == "close":
+     media_type = "wav"
+ else:
+     media_type = "ogg"
+ logger.info(f"Encoding format: {media_type}")
+
+ # initialize the models
+ cnhubert.cnhubert_base_path = cnhubert_base_path
+ tokenizer = AutoTokenizer.from_pretrained(bert_path)
+ bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
+ ssl_model = cnhubert.get_model()
+ if is_half:
+     bert_model = bert_model.half().to(device)
+     ssl_model = ssl_model.half().to(device)
+ else:
+     bert_model = bert_model.to(device)
+     ssl_model = ssl_model.to(device)
+ change_sovits_weights(sovits_path)
+ change_gpt_weights(gpt_path)
+
+
+ # --------------------------------
+ # API endpoints
+ # --------------------------------
+ app = FastAPI()
+
+ @app.post("/set_model")
+ async def set_model(request: Request):
+     json_post_raw = await request.json()
+     global gpt_path
+     gpt_path = json_post_raw.get("gpt_model_path")
+     global sovits_path
+     sovits_path = json_post_raw.get("sovits_model_path")
+     logger.info("gptpath" + gpt_path + ";vitspath" + sovits_path)
+     change_sovits_weights(sovits_path)
+     change_gpt_weights(gpt_path)
+     return "ok"
+
+
+ @app.post("/control")
+ async def control(request: Request):
+     json_post_raw = await request.json()
+     return handle_control(json_post_raw.get("command"))
+
+
+ @app.get("/control")
+ async def control(command: str = None):
+     return handle_control(command)
+
+
+ @app.post("/change_refer")
+ async def change_refer(request: Request):
+     json_post_raw = await request.json()
+     return handle_change(
+         json_post_raw.get("refer_wav_path"),
+         json_post_raw.get("prompt_text"),
+         json_post_raw.get("prompt_language")
+     )
+
+
+ @app.get("/change_refer")
+ async def change_refer(
+         refer_wav_path: str = None,
+         prompt_text: str = None,
+         prompt_language: str = None
+ ):
+     return handle_change(refer_wav_path, prompt_text, prompt_language)
+
+
+ @app.post("/")
+ async def tts_endpoint(request: Request):
+     json_post_raw = await request.json()
+     return handle(
+         json_post_raw.get("refer_wav_path"),
+         json_post_raw.get("prompt_text"),
+         json_post_raw.get("prompt_language"),
+         json_post_raw.get("text"),
+         json_post_raw.get("text_language"),
+         json_post_raw.get("cut_punc"),
+     )
+
+
+ @app.get("/")
+ async def tts_endpoint(
+         refer_wav_path: str = None,
+         prompt_text: str = None,
+         prompt_language: str = None,
+         text: str = None,
+         text_language: str = None,
+         cut_punc: str = None,
+ ):
+     return handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc)
+
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host=host, port=port, workers=1)
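Because this revision keeps the `/set_model` endpoint, one server process can switch speakers at runtime before requesting audio. A rough sketch (the weight paths below are placeholders, not files shipped with this commit):

```python
import requests

requests.post("http://127.0.0.1:9880/set_model", json={
    "gpt_model_path": "GPT_weights/example-e15.ckpt",           # placeholder path
    "sovits_model_path": "SoVITS_weights/example_e8_s200.pth",  # placeholder path
})
```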
api_v2.py ADDED
@@ -0,0 +1,453 @@
+ """
+ # WebAPI documentation
+
+ ` python api_v2.py -a 127.0.0.1 -p 9880 -c GPT_SoVITS/configs/tts_infer.yaml `
+
+ ## Command-line arguments:
+ `-a` - `bind address, default "127.0.0.1"`
+ `-p` - `bind port, default 9880`
+ `-c` - `TTS config file path, default "GPT_SoVITS/configs/tts_infer.yaml"`
+
+ ## Endpoints:
+
+ ### Inference
+
+ endpoint: `/tts`
+ GET:
+ ```
+ http://127.0.0.1:9880/tts?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_lang=zh&ref_audio_path=archive_jingyuan_1.wav&prompt_lang=zh&prompt_text=我是「罗浮」云骑将军景元。不必拘谨,「将军」只是一时的身份,你称呼我景元便可&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true
+ ```
+
+ POST:
+ ```json
+ {
+     "text": "",                   # str.(required) text to be synthesized
+     "text_lang": "",              # str.(required) language of the text to be synthesized
+     "ref_audio_path": "",         # str.(required) reference audio path
+     "prompt_text": "",            # str.(optional) prompt text for the reference audio
+     "prompt_lang": "",            # str.(required) language of the prompt text for the reference audio
+     "top_k": 5,                   # int.(optional) top k sampling
+     "top_p": 1,                   # float.(optional) top p sampling
+     "temperature": 1,             # float.(optional) temperature for sampling
+     "text_split_method": "cut5",  # str.(optional) text split method, see text_segmentation_method.py for details.
+     "batch_size": 1,              # int.(optional) batch size for inference
+     "batch_threshold": 0.75,      # float.(optional) threshold for batch splitting.
+     "split_bucket": true,         # bool.(optional) whether to split the batch into multiple buckets.
+     "speed_factor": 1.0,          # float.(optional) control the speed of the synthesized audio.
+     "fragment_interval": 0.3,     # float.(optional) control the interval of the audio fragments.
+     "seed": -1,                   # int.(optional) random seed for reproducibility.
+     "media_type": "wav",          # str.(optional) media type of the output audio, supports "wav", "raw", "ogg", "aac".
+     "streaming_mode": false,      # bool.(optional) whether to return a streaming response.
+     "parallel_infer": true,       # bool.(optional) whether to use parallel inference.
+     "repetition_penalty": 1.35    # float.(optional) repetition penalty for the T2S model.
+ }
+ ```
+
+ RESP:
+ success: returns the wav audio stream directly, http code 200
+ failure: returns json with the error message, http code 400
+
+ ### Command control
+
+ endpoint: `/control`
+
+ command:
+ "restart": restart the service
+ "exit": stop the service
+
+ GET:
+ ```
+ http://127.0.0.1:9880/control?command=restart
+ ```
+ POST:
+ ```json
+ {
+     "command": "restart"
+ }
+ ```
+
+ RESP: none
+
+
+ ### Switch GPT model
+
+ endpoint: `/set_gpt_weights`
+
+ GET:
+ ```
+ http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+ ```
+ RESP:
+ success: returns "success", http code 200
+ failure: returns json with the error message, http code 400
+
+
+ ### Switch SoVITS model
+
+ endpoint: `/set_sovits_weights`
+
+ GET:
+ ```
+ http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/s2G488k.pth
+ ```
+
+ RESP:
+ success: returns "success", http code 200
+ failure: returns json with the error message, http code 400
+
+ """
+ import os
+ import sys
+ import traceback
+ from typing import Generator
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ sys.path.append("%s/GPT_SoVITS" % (now_dir))
+
+ import argparse
+ import subprocess
+ import wave
+ import signal
+ import numpy as np
+ import soundfile as sf
+ from fastapi import FastAPI, Request, HTTPException, Response, UploadFile, File
+ from fastapi.responses import StreamingResponse, JSONResponse
+ import uvicorn
+ from io import BytesIO
+ from tools.i18n.i18n import I18nAuto
+ from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
+ from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
+ from pydantic import BaseModel
+ # print(sys.path)
+ i18n = I18nAuto()
+ cut_method_names = get_cut_method_names()
+
+ parser = argparse.ArgumentParser(description="GPT-SoVITS api")
+ parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="path to the tts_infer config")
+ parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
+ parser.add_argument("-p", "--port", type=int, default=9880, help="default: 9880")
+ args = parser.parse_args()
+ config_path = args.tts_config
+ # device = args.device
+ port = args.port
+ host = args.bind_addr
+ argv = sys.argv
+
+ if config_path in [None, ""]:
+     config_path = "GPT_SoVITS/configs/tts_infer.yaml"
+
+ tts_config = TTS_Config(config_path)
+ tts_pipeline = TTS(tts_config)
+
+ APP = FastAPI()
+
+
+ class TTS_Request(BaseModel):
+     text: str = None
+     text_lang: str = None
+     ref_audio_path: str = None
+     prompt_lang: str = None
+     prompt_text: str = ""
+     top_k: int = 5
+     top_p: float = 1
+     temperature: float = 1
+     text_split_method: str = "cut5"
+     batch_size: int = 1
+     batch_threshold: float = 0.75
+     split_bucket: bool = True
+     speed_factor: float = 1.0
+     fragment_interval: float = 0.3
+     seed: int = -1
+     media_type: str = "wav"
+     streaming_mode: bool = False
+     parallel_infer: bool = True
+     repetition_penalty: float = 1.35
+
+
+ # modified from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
+ def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
+     with sf.SoundFile(io_buffer, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
+         audio_file.write(data)
+     return io_buffer
+
+
+ def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int):
+     io_buffer.write(data.tobytes())
+     return io_buffer
+
+
+ def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int):
+     io_buffer = BytesIO()
+     sf.write(io_buffer, data, rate, format='wav')
+     return io_buffer
+
+
+ def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int):
+     process = subprocess.Popen([
+         'ffmpeg',
+         '-f', 's16le',  # input: 16-bit signed little-endian PCM
+         '-ar', str(rate),  # sample rate
+         '-ac', '1',  # mono
+         '-i', 'pipe:0',  # read input from the pipe
+         '-c:a', 'aac',  # encode audio as AAC
+         '-b:a', '192k',  # bitrate
+         '-vn',  # no video
+         '-f', 'adts',  # output an ADTS AAC stream
+         'pipe:1'  # write output to the pipe
+     ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     out, _ = process.communicate(input=data.tobytes())
+     io_buffer.write(out)
+     return io_buffer
+
+
+ def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str):
+     if media_type == "ogg":
+         io_buffer = pack_ogg(io_buffer, data, rate)
+     elif media_type == "aac":
+         io_buffer = pack_aac(io_buffer, data, rate)
+     elif media_type == "wav":
+         io_buffer = pack_wav(io_buffer, data, rate)
+     else:
+         io_buffer = pack_raw(io_buffer, data, rate)
+     io_buffer.seek(0)
+     return io_buffer
+
+
+ # from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py
+ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
+     # This creates a wave header and appends the frame input.
+     # It should come first in a streaming wav file;
+     # later frames should not carry it (or you will hear artifacts at each chunk start).
+     wav_buf = BytesIO()
+     with wave.open(wav_buf, "wb") as vfout:
+         vfout.setnchannels(channels)
+         vfout.setsampwidth(sample_width)
+         vfout.setframerate(sample_rate)
+         vfout.writeframes(frame_input)
+
+     wav_buf.seek(0)
+     return wav_buf.read()
+
+
+ def handle_control(command: str):
+     if command == "restart":
+         os.execl(sys.executable, sys.executable, *argv)
+     elif command == "exit":
+         os.kill(os.getpid(), signal.SIGTERM)
+         exit(0)
+
+
+ def check_params(req: dict):
+     text: str = req.get("text", "")
+     text_lang: str = req.get("text_lang", "")
+     ref_audio_path: str = req.get("ref_audio_path", "")
+     streaming_mode: bool = req.get("streaming_mode", False)
+     media_type: str = req.get("media_type", "wav")
+     prompt_lang: str = req.get("prompt_lang", "")
+     text_split_method: str = req.get("text_split_method", "cut5")
+
+     if ref_audio_path in [None, ""]:
+         return JSONResponse(status_code=400, content={"message": "ref_audio_path is required"})
+     if text in [None, ""]:
+         return JSONResponse(status_code=400, content={"message": "text is required"})
+     if text_lang in [None, ""]:
+         return JSONResponse(status_code=400, content={"message": "text_lang is required"})
+     elif text_lang.lower() not in tts_config.languages:
+         return JSONResponse(status_code=400, content={"message": "text_lang is not supported"})
+     if prompt_lang in [None, ""]:
+         return JSONResponse(status_code=400, content={"message": "prompt_lang is required"})
+     elif prompt_lang.lower() not in tts_config.languages:
+         return JSONResponse(status_code=400, content={"message": "prompt_lang is not supported"})
+     if media_type not in ["wav", "raw", "ogg", "aac"]:
+         return JSONResponse(status_code=400, content={"message": "media_type is not supported"})
+     elif media_type == "ogg" and not streaming_mode:
+         return JSONResponse(status_code=400, content={"message": "ogg format is not supported in non-streaming mode"})
+
+     if text_split_method not in cut_method_names:
+         return JSONResponse(status_code=400, content={"message": f"text_split_method:{text_split_method} is not supported"})
+
+     return None
+
+
+ async def tts_handle(req: dict):
+     """
+     Text to speech handler.
+
+     Args:
+         req (dict):
+             {
+                 "text": "",                   # str.(required) text to be synthesized
+                 "text_lang": "",              # str.(required) language of the text to be synthesized
+                 "ref_audio_path": "",         # str.(required) reference audio path
+                 "prompt_text": "",            # str.(optional) prompt text for the reference audio
+                 "prompt_lang": "",            # str.(required) language of the prompt text for the reference audio
+                 "top_k": 5,                   # int. top k sampling
+                 "top_p": 1,                   # float. top p sampling
+                 "temperature": 1,             # float. temperature for sampling
+                 "text_split_method": "cut5",  # str. text split method, see text_segmentation_method.py for details.
+                 "batch_size": 1,              # int. batch size for inference
+                 "batch_threshold": 0.75,      # float. threshold for batch splitting.
+                 "split_bucket": True,         # bool. whether to split the batch into multiple buckets.
+                 "speed_factor": 1.0,          # float. control the speed of the synthesized audio.
+                 "fragment_interval": 0.3,     # float. control the interval of the audio fragments.
+                 "seed": -1,                   # int. random seed for reproducibility.
+                 "media_type": "wav",          # str. media type of the output audio, supports "wav", "raw", "ogg", "aac".
+                 "streaming_mode": False,      # bool. whether to return a streaming response.
+                 "parallel_infer": True,       # bool.(optional) whether to use parallel inference.
+                 "repetition_penalty": 1.35    # float.(optional) repetition penalty for the T2S model.
+             }
+     returns:
+         StreamingResponse: audio stream response.
+     """
+
+     streaming_mode = req.get("streaming_mode", False)
+     media_type = req.get("media_type", "wav")
+
+     check_res = check_params(req)
+     if check_res is not None:
+         return check_res
+
+     if streaming_mode:
+         req["return_fragment"] = True
+
+     try:
+         tts_generator = tts_pipeline.run(req)
+
+         if streaming_mode:
+             def streaming_generator(tts_generator: Generator, media_type: str):
+                 if media_type == "wav":
+                     yield wave_header_chunk()
+                     media_type = "raw"
+                 for sr, chunk in tts_generator:
+                     yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue()
+             # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}"
+             return StreamingResponse(streaming_generator(tts_generator, media_type), media_type=f"audio/{media_type}")
+
+         else:
+             sr, audio_data = next(tts_generator)
+             audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue()
+             return Response(audio_data, media_type=f"audio/{media_type}")
+     except Exception as e:
+         return JSONResponse(status_code=400, content={"message": "tts failed", "Exception": str(e)})
+
+
+ @APP.get("/control")
+ async def control(command: str = None):
+     if command is None:
+         return JSONResponse(status_code=400, content={"message": "command is required"})
+     handle_control(command)
+
+
+ @APP.get("/tts")
+ async def tts_get_endpoint(
+         text: str = None,
+         text_lang: str = None,
+         ref_audio_path: str = None,
+         prompt_lang: str = None,
+         prompt_text: str = "",
+         top_k: int = 5,
+         top_p: float = 1,
+         temperature: float = 1,
+         text_split_method: str = "cut0",
+         batch_size: int = 1,
+         batch_threshold: float = 0.75,
+         split_bucket: bool = True,
+         speed_factor: float = 1.0,
+         fragment_interval: float = 0.3,
+         seed: int = -1,
+         media_type: str = "wav",
+         streaming_mode: bool = False,
+         parallel_infer: bool = True,
+         repetition_penalty: float = 1.35
+ ):
+     req = {
+         "text": text,
+         "text_lang": text_lang.lower(),
+         "ref_audio_path": ref_audio_path,
+         "prompt_text": prompt_text,
+         "prompt_lang": prompt_lang.lower(),
+         "top_k": top_k,
+         "top_p": top_p,
+         "temperature": temperature,
+         "text_split_method": text_split_method,
+         "batch_size": int(batch_size),
+         "batch_threshold": float(batch_threshold),
+         "speed_factor": float(speed_factor),
+         "split_bucket": split_bucket,
+         "fragment_interval": fragment_interval,
+         "seed": seed,
+         "media_type": media_type,
+         "streaming_mode": streaming_mode,
+         "parallel_infer": parallel_infer,
+         "repetition_penalty": float(repetition_penalty)
+     }
+     return await tts_handle(req)
+
+
+ @APP.post("/tts")
+ async def tts_post_endpoint(request: TTS_Request):
+     req = request.dict()
+     return await tts_handle(req)
+
+
+ @APP.get("/set_refer_audio")
+ async def set_refer_audio(refer_audio_path: str = None):
+     try:
+         tts_pipeline.set_ref_audio(refer_audio_path)
+     except Exception as e:
+         return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)})
+     return JSONResponse(status_code=200, content={"message": "success"})
+
+
+ # @APP.post("/set_refer_audio")
+ # async def set_refer_audio_post(audio_file: UploadFile = File(...)):
+ #     try:
+ #         # check the file type to make sure it is an audio file
+ #         if not audio_file.content_type.startswith("audio/"):
+ #             return JSONResponse(status_code=400, content={"message": "file type is not supported"})
+ #
+ #         os.makedirs("uploaded_audio", exist_ok=True)
+ #         save_path = os.path.join("uploaded_audio", audio_file.filename)
+ #         # save the audio file to a directory on the server
+ #         with open(save_path, "wb") as buffer:
+ #             buffer.write(await audio_file.read())
+ #
+ #         tts_pipeline.set_ref_audio(save_path)
+ #     except Exception as e:
+ #         return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)})
+ #     return JSONResponse(status_code=200, content={"message": "success"})
+
+ @APP.get("/set_gpt_weights")
+ async def set_gpt_weights(weights_path: str = None):
425
+ try:
426
+ if weights_path in ["", None]:
427
+ return JSONResponse(status_code=400, content={"message": "gpt weight path is required"})
428
+ tts_pipeline.init_t2s_weights(weights_path)
429
+ except Exception as e:
430
+ return JSONResponse(status_code=400, content={"message": f"change gpt weight failed", "Exception": str(e)})
431
+
432
+ return JSONResponse(status_code=200, content={"message": "success"})
433
+
434
+
435
+ @APP.get("/set_sovits_weights")
436
+ async def set_sovits_weights(weights_path: str = None):
437
+ try:
438
+ if weights_path in ["", None]:
439
+ return JSONResponse(status_code=400, content={"message": "sovits weight path is required"})
440
+ tts_pipeline.init_vits_weights(weights_path)
441
+ except Exception as e:
442
+ return JSONResponse(status_code=400, content={"message": f"change sovits weight failed", "Exception": str(e)})
443
+ return JSONResponse(status_code=200, content={"message": "success"})
444
+
445
+
446
+
447
+ if __name__ == "__main__":
448
+ try:
449
+ uvicorn.run(APP, host=host, port=port, workers=1)
450
+ except Exception as e:
451
+ traceback.print_exc()
452
+ os.kill(os.getpid(), signal.SIGTERM)
453
+ exit(0)
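
With these endpoints in place, the API can be exercised end to end. Below is a minimal client sketch, assuming the server is listening on the default 127.0.0.1:9880 bind; the reference-audio path and texts are placeholders:

```python
import requests

BASE = "http://127.0.0.1:9880"  # assumed default bind address/port

payload = {
    "text": "Hello there.",
    "text_lang": "en",
    "ref_audio_path": "123.wav",       # hypothetical reference audio on the server
    "prompt_text": "one two three.",
    "prompt_lang": "en",
    "media_type": "wav",
    "streaming_mode": False,
}

resp = requests.post(f"{BASE}/tts", json=payload)
if resp.status_code == 200:
    with open("out.wav", "wb") as f:
        f.write(resp.content)          # non-streaming: the whole wav body at once
else:
    print(resp.json())                 # error JSON with http 400

# Streaming variant: consume fragments as they arrive.
with requests.post(f"{BASE}/tts", json={**payload, "streaming_mode": True}, stream=True) as r:
    with open("out_stream.wav", "wb") as f:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)
```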
config.py CHANGED
@@ -1,66 +1,66 @@
+ import sys, os
+ 
+ import torch
+ 
+ # Models specified for inference
+ sovits_path = ""
+ gpt_path = ""
+ is_half_str = os.environ.get("is_half", "True")
+ is_half = is_half_str.lower() == 'true'
+ is_share_str = os.environ.get("is_share", "False")
+ is_share = is_share_str.lower() == 'true'
+ 
+ cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
+ bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+ pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
+ pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+ 
+ exp_root = "logs"
+ python_exec = sys.executable or "python"
+ if torch.cuda.is_available():
+     infer_device = "cuda"
+ else:
+     infer_device = "cpu"
+ 
+ webui_port_main = 9874
+ webui_port_uvr5 = 9873
+ webui_port_infer_tts = 9872
+ webui_port_subfix = 9871
+ 
+ api_port = 9880
+ 
+ if infer_device == "cuda":
+     gpu_name = torch.cuda.get_device_name(0)
+     if (
+         ("16" in gpu_name and "V100" not in gpu_name.upper())
+         or "P40" in gpu_name.upper()
+         or "P10" in gpu_name.upper()
+         or "1060" in gpu_name
+         or "1070" in gpu_name
+         or "1080" in gpu_name
+     ):
+         is_half = False
+ 
+ if infer_device == "cpu":
+     is_half = False
+ 
+ class Config:
+     def __init__(self):
+         self.sovits_path = sovits_path
+         self.gpt_path = gpt_path
+         self.is_half = is_half
+ 
+         self.cnhubert_path = cnhubert_path
+         self.bert_path = bert_path
+         self.pretrained_sovits_path = pretrained_sovits_path
+         self.pretrained_gpt_path = pretrained_gpt_path
+ 
+         self.exp_root = exp_root
+         self.python_exec = python_exec
+         self.infer_device = infer_device
+ 
+         self.webui_port_main = webui_port_main
+         self.webui_port_uvr5 = webui_port_uvr5
+         self.webui_port_infer_tts = webui_port_infer_tts
+         self.webui_port_subfix = webui_port_subfix
+ 
+         self.api_port = api_port
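
Because config.py reads `is_half` and `is_share` from the environment at import time, precision and Gradio sharing can be toggled per run without editing the file. A small sketch of that pattern:

```python
import os

# Must be set before config is imported.
os.environ["is_half"] = "False"   # force full precision
os.environ["is_share"] = "True"   # ask the WebUIs to create public Gradio links

import config
print(config.Config().is_half)    # -> False (unless the GPU check already forced it)
```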
go-webui.bat CHANGED
@@ -1,2 +1,2 @@
+ runtime\python.exe webui.py
+ pause
requirements.txt CHANGED
@@ -1,4 +1,4 @@
- numpy # OpenVoice: numpy==1.22.0
+ numpy
  scipy
  tensorboard
  librosa==0.9.2
@@ -25,14 +25,4 @@ jieba_fast
  jieba
  LangSegment>=0.2.0
  Faster_Whisper
- wordsegment
- faster-whisper==0.9.0
- pydub==0.25.1
- wavmark==0.0.3
- eng_to_ipa==0.0.2
- inflect==7.0.0
- unidecode==1.3.7
- whisper-timestamped==1.14.2
- openai
- python-dotenv
- langid==1.1.6
+ wordsegment
tools/asr/fasterwhisper_asr.py CHANGED
@@ -1,18 +1,16 @@
  import argparse
  import os
- os.environ["HF_ENDPOINT"]="https://hf-mirror.com"
  import traceback
- import requests
- from glob import glob
- import torch
  
+ os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+ 
+ import torch
  from faster_whisper import WhisperModel
  from tqdm import tqdm
  
  from tools.asr.config import check_fw_local_models
  
- os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
- 
  language_code_list = [
      "af", "am", "ar", "as", "az",
      "ba", "be", "bg", "bn", "bo",
@@ -36,7 +34,7 @@ language_code_list = [
      "vi", "yi", "yo", "zh", "yue",
      "auto"]
  
- def execute_asr(input_folder, output_folder, model_size, language,precision):
+ def execute_asr(input_folder, output_folder, model_size, language, precision):
      if '-local' in model_size:
          model_size = model_size[:-6]
          model_path = f'tools/asr/models/faster-whisper-{model_size}'
@@ -50,17 +48,18 @@ def execute_asr(input_folder, output_folder, model_size, language, precision):
          model = WhisperModel(model_path, device=device, compute_type=precision)
      except:
          return print(traceback.format_exc())
+ 
+     input_file_names = os.listdir(input_folder)
+     input_file_names.sort()
+ 
      output = []
      output_file_name = os.path.basename(input_folder)
-     output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
- 
-     if not os.path.exists(output_folder):
-         os.makedirs(output_folder)
- 
-     for file in tqdm(glob(os.path.join(input_folder, '**/*.wav'), recursive=True)):
+ 
+     for file_name in tqdm(input_file_names):
          try:
+             file_path = os.path.join(input_folder, file_name)
              segments, info = model.transcribe(
-                 audio = file,
+                 audio = file_path,
                  beam_size = 5,
                  vad_filter = True,
                  vad_parameters = dict(min_silence_duration_ms=700),
@@ -68,18 +67,23 @@ def execute_asr(input_folder, output_folder, model_size, language, precision):
              text = ''
  
              if info.language == "zh":
-                 print("检测为中文文本,转funasr处理")
+                 print("检测为中文文本, 转 FunASR 处理")
                  if("only_asr" not in globals()):
-                     from tools.asr.funasr_asr import only_asr  # no need to import (and download) the model for English-only runs
-                 text = only_asr(file)
+                     from tools.asr.funasr_asr import \
+                         only_asr  # no need to import (and download) the model for English-only runs
+                 text = only_asr(file_path)
  
              if text == '':
                  for segment in segments:
                      text += segment.text
-             output.append(f"{file}|{output_file_name}|{info.language.upper()}|{text}")
+             output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}")
          except:
              return print(traceback.format_exc())
- 
+ 
+     output_folder = output_folder or "output/asr_opt"
+     os.makedirs(output_folder, exist_ok=True)
+     output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
+ 
      with open(output_file_path, "w", encoding="utf-8") as f:
          f.write("\n".join(output))
      print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
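
For reference, the script can be driven the same way webui.py's `open_asr` assembles its command line; the folder names and model size below are illustrative, not required values:

```python
import subprocess, sys

subprocess.run([
    sys.executable, "tools/asr/fasterwhisper_asr.py",
    "-i", "output/slicer_opt",   # hypothetical folder of sliced wavs
    "-o", "output/asr_opt",      # falls back to output/asr_opt when empty, per the code above
    "-s", "large-v3",            # assumed available model size (or a "-local" variant)
    "-l", "en",                  # or "auto"
    "-p", "float16",
], check=True)
```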
tools/asr/funasr_asr.py CHANGED
@@ -38,10 +38,11 @@ def execute_asr(input_folder, output_folder, model_size, language):
      output = []
      output_file_name = os.path.basename(input_folder)
  
-     for name in tqdm(input_file_names):
+     for file_name in tqdm(input_file_names):
          try:
-             text = model.generate(input="%s/%s"%(input_folder, name))[0]["text"]
-             output.append(f"{input_folder}/{name}|{output_file_name}|{language.upper()}|{text}")
+             file_path = os.path.join(input_folder, file_name)
+             text = model.generate(input=file_path)[0]["text"]
+             output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
          except:
              print(traceback.format_exc())
  
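
Both ASR scripts now emit the same pipe-separated annotation layout, one utterance per line. A sketch of what a line looks like and how downstream code can split it (values illustrative):

```python
# One line of the generated .list annotation file:
line = "output/slicer_opt/vocal_001.wav|slicer_opt|ZH|今天天气不错。"
wav_path, folder_name, lang, text = line.split("|", 3)
```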
tools/cmd-denoise.py CHANGED
@@ -1,29 +1,29 @@
+ import os, argparse
+ 
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+ from tqdm import tqdm
+ 
+ path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
+ path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
+ ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise)
+ def execute_denoise(input_folder, output_folder):
+     os.makedirs(output_folder, exist_ok=True)
+     # print(input_folder)
+     # print(list(os.listdir(input_folder).sort()))
+     for name in tqdm(os.listdir(input_folder)):
+         ans("%s/%s" % (input_folder, name), output_path='%s/%s' % (output_folder, name))
+ 
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument("-i", "--input_folder", type=str, required=True,
+                         help="Path to the folder containing WAV files.")
+     parser.add_argument("-o", "--output_folder", type=str, required=True,
+                         help="Output folder to store transcriptions.")
+     parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16', 'float32'],
+                         help="fp16 or fp32")  # not wired up yet
+     cmd = parser.parse_args()
+     execute_denoise(
+         input_folder = cmd.input_folder,
+         output_folder = cmd.output_folder,
      )
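
Invoking the denoiser matches its argparse definition above; the paths here are illustrative:

```python
import subprocess, sys

subprocess.run([
    sys.executable, "tools/cmd-denoise.py",
    "-i", "output/slicer_opt",     # hypothetical input folder of wavs
    "-o", "output/denoise_opt",
    "-p", "float16",               # parsed but, per the comment above, not wired up yet
], check=True)
```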
tools/i18n/i18n.py CHANGED
@@ -4,7 +4,7 @@ import os
  
  
  def load_language_list(language):
-     with open(f"./i18n/locale/zh_CN.json", "r", encoding="utf-8") as f:
+     with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f:
          language_list = json.load(f)
      return language_list
  
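
This one-line fix makes `load_language_list` honor the requested locale instead of always loading zh_CN. A minimal sketch of the effect; the locale name is an assumption about which files exist under i18n/locale/:

```python
from tools.i18n.i18n import load_language_list

mapping = load_language_list("en_US")   # now reads ./i18n/locale/en_US.json (assumed to exist)
print(mapping.get("UVR5已开启"))         # -> the English rendering of that key, per the locale file
```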
tools/my_utils.py CHANGED
@@ -1,31 +1,31 @@
+ import platform, os, traceback
+ import ffmpeg
+ import numpy as np
+ 
+ 
+ def load_audio(file, sr):
+     try:
+         # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
+         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+         # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+         file = clean_path(file)  # guard against pasted paths carrying stray spaces, quotes, or newlines
+         if os.path.exists(file) == False:
+             raise RuntimeError(
+                 "You input a wrong audio path that does not exist, please fix it!"
+             )
+         out, _ = (
+             ffmpeg.input(file, threads=0)
+             .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
+             .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+         )
+     except Exception as e:
+         traceback.print_exc()
+         raise RuntimeError(f"Failed to load audio: {e}")
+ 
+     return np.frombuffer(out, np.float32).flatten()
+ 
+ 
+ def clean_path(path_str):
+     if platform.system() == 'Windows':
+         path_str = path_str.replace('/', '\\')
+     return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
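
A short usage sketch of the two helpers; the path is hypothetical:

```python
from tools.my_utils import clean_path, load_audio

# clean_path strips the stray quotes/whitespace that copy-pasted paths often carry
path = clean_path(' "C:/data/sample.wav" \n')
audio = load_audio(path, 32000)   # mono float32 ndarray resampled to 32 kHz
```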
tools/slice_audio.py CHANGED
@@ -1,48 +1,48 @@
+ import os, sys, numpy as np
+ import traceback
+ from scipy.io import wavfile
+ # parent_directory = os.path.dirname(os.path.abspath(__file__))
+ # sys.path.append(parent_directory)
+ from my_utils import load_audio
+ from slicer2 import Slicer
+ 
+ def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part):
+     os.makedirs(opt_root, exist_ok=True)
+     if os.path.isfile(inp):
+         input = [inp]
+     elif os.path.isdir(inp):
+         input = [os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))]
+     else:
+         return "输入路径存在但既不是文件也不是文件夹"
+     slicer = Slicer(
+         sr=32000,                        # sample rate the long audio is decoded at
+         threshold=int(threshold),        # frames quieter than this (dB) are candidate silence cut points
+         min_length=int(min_length),      # minimum clip length; a too-short head is merged with following clips until exceeding it
+         min_interval=int(min_interval),  # minimum interval between cuts
+         hop_size=int(hop_size),          # hop of the volume curve; smaller = finer resolution but more compute (finer is not always better)
+         max_sil_kept=int(max_sil_kept),  # maximum silence kept around each cut
+     )
+     _max = float(_max)
+     alpha = float(alpha)
+     for inp_path in input[int(i_part)::int(all_part)]:
+         # print(inp_path)
+         try:
+             name = os.path.basename(inp_path)
+             audio = load_audio(inp_path, 32000)
+             # print(audio.shape)
+             for chunk, start, end in slicer.slice(audio):  # start and end are sample offsets
+                 tmp_max = np.abs(chunk).max()
+                 if tmp_max > 1: chunk /= tmp_max
+                 chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
+                 wavfile.write(
+                     "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end),
+                     32000,
+                     # chunk.astype(np.float32),
+                     (chunk * 32767).astype(np.int16),
+                 )
+         except:
+             print(inp_path, "->fail->", traceback.format_exc())
+     return "执行完毕,请检查输出文件"
+ 
+ print(slice(*sys.argv[1:]))
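
Launching the slicer as webui.py's `open_slice` does, with a single worker; all argument values below are illustrative:

```python
import subprocess, sys

subprocess.run([
    sys.executable, "tools/slice_audio.py",
    "input_audio",        # inp: a file or a folder (hypothetical)
    "output/slicer_opt",  # opt_root
    "-34", "4000", "300", "10", "500",   # threshold, min_length, min_interval, hop_size, max_sil_kept
    "0.9", "0.25",                       # _max, alpha
    "0", "1",                            # i_part, all_part (single worker)
], check=True)
```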
tools/slicer2.py CHANGED
@@ -1,261 +1,261 @@
+ import numpy as np
+ 
+ 
+ # This function is obtained from librosa.
+ def get_rms(
+     y,
+     frame_length=2048,
+     hop_length=512,
+     pad_mode="constant",
+ ):
+     padding = (int(frame_length // 2), int(frame_length // 2))
+     y = np.pad(y, padding, mode=pad_mode)
+ 
+     axis = -1
+     # put our new within-frame axis at the end for now
+     out_strides = y.strides + tuple([y.strides[axis]])
+     # Reduce the shape on the framing axis
+     x_shape_trimmed = list(y.shape)
+     x_shape_trimmed[axis] -= frame_length - 1
+     out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
+     xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
+     if axis < 0:
+         target_axis = axis - 1
+     else:
+         target_axis = axis + 1
+     xw = np.moveaxis(xw, -1, target_axis)
+     # Downsample along the target axis
+     slices = [slice(None)] * xw.ndim
+     slices[axis] = slice(0, None, hop_length)
+     x = xw[tuple(slices)]
+ 
+     # Calculate power
+     power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
+ 
+     return np.sqrt(power)
+ 
+ 
+ class Slicer:
+     def __init__(
+         self,
+         sr: int,
+         threshold: float = -40.0,
+         min_length: int = 5000,
+         min_interval: int = 300,
+         hop_size: int = 20,
+         max_sil_kept: int = 5000,
+     ):
+         if not min_length >= min_interval >= hop_size:
+             raise ValueError(
+                 "The following condition must be satisfied: min_length >= min_interval >= hop_size"
+             )
+         if not max_sil_kept >= hop_size:
+             raise ValueError(
+                 "The following condition must be satisfied: max_sil_kept >= hop_size"
+             )
+         min_interval = sr * min_interval / 1000
+         self.threshold = 10 ** (threshold / 20.0)
+         self.hop_size = round(sr * hop_size / 1000)
+         self.win_size = min(round(min_interval), 4 * self.hop_size)
+         self.min_length = round(sr * min_length / 1000 / self.hop_size)
+         self.min_interval = round(min_interval / self.hop_size)
+         self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
+ 
+     def _apply_slice(self, waveform, begin, end):
+         if len(waveform.shape) > 1:
+             return waveform[
+                 :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
+             ]
+         else:
+             return waveform[
+                 begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
+             ]
+ 
+     # @timeit
+     def slice(self, waveform):
+         if len(waveform.shape) > 1:
+             samples = waveform.mean(axis=0)
+         else:
+             samples = waveform
+         if samples.shape[0] <= self.min_length:
+             # keep the (chunk, start, end) contract used below and relied on by slice_audio.py
+             return [[waveform, 0, int(samples.shape[0])]]
+         rms_list = get_rms(
+             y=samples, frame_length=self.win_size, hop_length=self.hop_size
+         ).squeeze(0)
+         sil_tags = []
+         silence_start = None
+         clip_start = 0
+         for i, rms in enumerate(rms_list):
+             # Keep looping while frame is silent.
+             if rms < self.threshold:
+                 # Record start of silent frames.
+                 if silence_start is None:
+                     silence_start = i
+                 continue
+             # Keep looping while frame is not silent and silence start has not been recorded.
+             if silence_start is None:
+                 continue
+             # Clear recorded silence start if interval is not enough or clip is too short
+             is_leading_silence = silence_start == 0 and i > self.max_sil_kept
+             need_slice_middle = (
+                 i - silence_start >= self.min_interval
+                 and i - clip_start >= self.min_length
+             )
+             if not is_leading_silence and not need_slice_middle:
+                 silence_start = None
+                 continue
+             # Need slicing. Record the range of silent frames to be removed.
+             if i - silence_start <= self.max_sil_kept:
+                 pos = rms_list[silence_start : i + 1].argmin() + silence_start
+                 if silence_start == 0:
+                     sil_tags.append((0, pos))
+                 else:
+                     sil_tags.append((pos, pos))
+                 clip_start = pos
+             elif i - silence_start <= self.max_sil_kept * 2:
+                 pos = rms_list[
+                     i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
+                 ].argmin()
+                 pos += i - self.max_sil_kept
+                 pos_l = (
+                     rms_list[
+                         silence_start : silence_start + self.max_sil_kept + 1
+                     ].argmin()
+                     + silence_start
+                 )
+                 pos_r = (
+                     rms_list[i - self.max_sil_kept : i + 1].argmin()
+                     + i
+                     - self.max_sil_kept
+                 )
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                     clip_start = pos_r
+                 else:
+                     sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
+                     clip_start = max(pos_r, pos)
+             else:
+                 pos_l = (
+                     rms_list[
+                         silence_start : silence_start + self.max_sil_kept + 1
+                     ].argmin()
+                     + silence_start
+                 )
+                 pos_r = (
+                     rms_list[i - self.max_sil_kept : i + 1].argmin()
+                     + i
+                     - self.max_sil_kept
+                 )
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                 else:
+                     sil_tags.append((pos_l, pos_r))
+                 clip_start = pos_r
+             silence_start = None
+         # Deal with trailing silence.
+         total_frames = rms_list.shape[0]
+         if (
+             silence_start is not None
+             and total_frames - silence_start >= self.min_interval
+         ):
+             silence_end = min(total_frames, silence_start + self.max_sil_kept)
+             pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
+             sil_tags.append((pos, total_frames + 1))
+         # Apply and return slices.
+         #### audio + start time + end time
+         if len(sil_tags) == 0:
+             return [[waveform, 0, int(total_frames * self.hop_size)]]
+         else:
+             chunks = []
+             if sil_tags[0][0] > 0:
+                 chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)])
+             for i in range(len(sil_tags) - 1):
+                 chunks.append(
+                     [self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]), int(sil_tags[i][1] * self.hop_size), int(sil_tags[i + 1][0] * self.hop_size)]
+                 )
+             if sil_tags[-1][1] < total_frames:
+                 chunks.append(
+                     [self._apply_slice(waveform, sil_tags[-1][1], total_frames), int(sil_tags[-1][1] * self.hop_size), int(total_frames * self.hop_size)]
+                 )
+             return chunks
+ 
+ 
+ def main():
+     import os.path
+     from argparse import ArgumentParser
+ 
+     import librosa
+     import soundfile
+ 
+     parser = ArgumentParser()
+     parser.add_argument("audio", type=str, help="The audio to be sliced")
+     parser.add_argument(
+         "--out", type=str, help="Output directory of the sliced audio clips"
+     )
+     parser.add_argument(
+         "--db_thresh",
+         type=float,
+         required=False,
+         default=-40,
+         help="The dB threshold for silence detection",
+     )
+     parser.add_argument(
+         "--min_length",
+         type=int,
+         required=False,
+         default=5000,
+         help="The minimum milliseconds required for each sliced audio clip",
+     )
+     parser.add_argument(
+         "--min_interval",
+         type=int,
+         required=False,
+         default=300,
+         help="The minimum milliseconds for a silence part to be sliced",
+     )
+     parser.add_argument(
+         "--hop_size",
+         type=int,
+         required=False,
+         default=10,
+         help="Frame length in milliseconds",
+     )
+     parser.add_argument(
+         "--max_sil_kept",
+         type=int,
+         required=False,
+         default=500,
+         help="The maximum silence length kept around the sliced clip, presented in milliseconds",
+     )
+     args = parser.parse_args()
+     out = args.out
+     if out is None:
+         out = os.path.dirname(os.path.abspath(args.audio))
+     audio, sr = librosa.load(args.audio, sr=None, mono=False)
+     slicer = Slicer(
+         sr=sr,
+         threshold=args.db_thresh,
+         min_length=args.min_length,
+         min_interval=args.min_interval,
+         hop_size=args.hop_size,
+         max_sil_kept=args.max_sil_kept,
+     )
+     chunks = slicer.slice(audio)
+     if not os.path.exists(out):
+         os.makedirs(out)
+     # slice() returns (chunk, start, end) triples in this variant, so unpack them here
+     for i, (chunk, start, end) in enumerate(chunks):
+         if len(chunk.shape) > 1:
+             chunk = chunk.T
+         soundfile.write(
+             os.path.join(
+                 out,
+                 "%s_%d.wav"
+                 % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
+             ),
+             chunk,
+             sr,
+         )
+ 
+ 
+ if __name__ == "__main__":
+     main()
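
Using the Slicer class directly mirrors what main() and slice_audio.py do; note that slice() returns (chunk, start, end) triples in this repo's variant, with start/end in samples. A sketch, assuming it is run with tools/ on the import path (as slice_audio.py is) and with a hypothetical input file:

```python
import librosa
import soundfile
from slicer2 import Slicer

audio, sr = librosa.load("long_recording.wav", sr=None, mono=False)  # hypothetical file
slicer = Slicer(sr=sr, threshold=-40.0, min_length=5000, min_interval=300,
                hop_size=20, max_sil_kept=500)
for i, (chunk, start, end) in enumerate(slicer.slice(audio)):
    if len(chunk.shape) > 1:
        chunk = chunk.T                      # soundfile expects (frames, channels)
    soundfile.write(f"clip_{i}_{start}_{end}.wav", chunk, sr)
```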
tools/subfix_webui.py CHANGED
@@ -493,6 +493,6 @@ if __name__ == "__main__":
          server_name="0.0.0.0",
          inbrowser=True,
          quiet=True,
-         share=True,
+         share=eval(args.is_share),
          server_port=int(args.webui_port_subfix)
      )
tools/uvr5/lib/lib_v5/modelparams/4band_v3.json CHANGED
@@ -1,54 +1,54 @@
+ {
+     "bins": 672,
+     "unstable_bins": 8,
+     "reduction_bins": 530,
+     "band": {
+         "1": {
+             "sr": 7350,
+             "hl": 80,
+             "n_fft": 640,
+             "crop_start": 0,
+             "crop_stop": 85,
+             "lpf_start": 25,
+             "lpf_stop": 53,
+             "res_type": "polyphase"
+         },
+         "2": {
+             "sr": 7350,
+             "hl": 80,
+             "n_fft": 320,
+             "crop_start": 4,
+             "crop_stop": 87,
+             "hpf_start": 25,
+             "hpf_stop": 12,
+             "lpf_start": 31,
+             "lpf_stop": 62,
+             "res_type": "polyphase"
+         },
+         "3": {
+             "sr": 14700,
+             "hl": 160,
+             "n_fft": 512,
+             "crop_start": 17,
+             "crop_stop": 216,
+             "hpf_start": 48,
+             "hpf_stop": 24,
+             "lpf_start": 139,
+             "lpf_stop": 210,
+             "res_type": "polyphase"
+         },
+         "4": {
+             "sr": 44100,
+             "hl": 480,
+             "n_fft": 960,
+             "crop_start": 78,
+             "crop_stop": 383,
+             "hpf_start": 130,
+             "hpf_stop": 86,
+             "res_type": "kaiser_fast"
+         }
+     },
+     "sr": 44100,
+     "pre_filter_start": 668,
+     "pre_filter_stop": 672
  }
tools/uvr5/webui.py CHANGED
@@ -73,8 +73,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
              os.path.basename(inp_path),
          )
          os.system(
-             "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y"
-             % (inp_path, tmp_path)
+             f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y'
          )
          inp_path = tmp_path
          try:
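
The quoting added above protects paths containing spaces. An equivalent sketch that sidesteps shell quoting entirely by passing the same ffmpeg flags as a list (paths hypothetical):

```python
import subprocess

inp_path = "input with spaces.mp3"        # hypothetical source file
tmp_path = "TEMP/input with spaces.wav"   # hypothetical converted output
subprocess.run([
    "ffmpeg", "-i", inp_path, "-vn", "-acodec", "pcm_s16le",
    "-ac", "2", "-ar", "44100", tmp_path, "-y",
], check=True)
```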
webui.py CHANGED
@@ -1,878 +1,878 @@
- import os,shutil,sys,pdb,re
- now_dir = os.getcwd()
- sys.path.insert(0, now_dir)
- import json,yaml,warnings,torch
- import platform
- import psutil
- import signal
- 
- warnings.filterwarnings("ignore")
- torch.manual_seed(233333)
- tmp = os.path.join(now_dir, "TEMP")
- os.makedirs(tmp, exist_ok=True)
- os.environ["TEMP"] = tmp
- if(os.path.exists(tmp)):
-     for name in os.listdir(tmp):
-         if(name=="jieba.cache"):continue
-         path="%s/%s"%(tmp,name)
-         delete=os.remove if os.path.isfile(path) else shutil.rmtree
-         try:
-             delete(path)
-         except Exception as e:
-             print(str(e))
-             pass
- import site
- site_packages_roots = []
- for path in site.getsitepackages():
-     if "packages" in path:
-         site_packages_roots.append(path)
- if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir]
- #os.environ["OPENBLAS_NUM_THREADS"] = "4"
- os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
- os.environ["all_proxy"] = ""
- for site_packages_root in site_packages_roots:
-     if os.path.exists(site_packages_root):
-         try:
-             with open("%s/users.pth" % (site_packages_root), "w") as f:
-                 f.write(
-                     "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
-                     % (now_dir, now_dir, now_dir, now_dir, now_dir)
-                 )
-             break
-         except PermissionError:
-             pass
- from tools import my_utils
- import traceback
- import shutil
- import pdb
- import gradio as gr
- from subprocess import Popen
- import signal
- from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share
- from tools.i18n.i18n import I18nAuto
- i18n = I18nAuto()
- from scipy.io import wavfile
- from tools.my_utils import load_audio
- from multiprocessing import cpu_count
- 
- # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # fall back to CPU for steps MPS does not support
- 
- n_cpu=cpu_count()
- 
- ngpu = torch.cuda.device_count()
- gpu_infos = []
- mem = []
- if_gpu_ok = False
- 
- # Check whether an NVIDIA GPU usable for training and accelerated inference is present
- if torch.cuda.is_available() or ngpu != 0:
-     for i in range(ngpu):
-         gpu_name = torch.cuda.get_device_name(i)
-         if any(value in gpu_name.upper() for value in ["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060"]):
-             # A10#A100#V100#A40#P40#M40#K80#A4500
-             if_gpu_ok = True  # at least one usable NVIDIA GPU
-             gpu_infos.append("%s\t%s" % (i, gpu_name))
-             mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
- # # Check whether MPS acceleration is supported
- # if torch.backends.mps.is_available():
- #     if_gpu_ok = True
- #     gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
- #     mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024)  # in practice, using system RAM as VRAM does not run out of memory
- 
- if if_gpu_ok and len(gpu_infos) > 0:
-     gpu_info = "\n".join(gpu_infos)
-     default_batch_size = min(mem) // 2
- else:
-     gpu_info = ("%s\t%s" % ("0", "CPU"))
-     gpu_infos.append("%s\t%s" % ("0", "CPU"))
-     default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2
- gpus = "-".join([i[0] for i in gpu_infos])
- 
- pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
- pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
- def get_weights_names():
-     SoVITS_names = [pretrained_sovits_name]
-     for name in os.listdir(SoVITS_weight_root):
-         if name.endswith(".pth"):SoVITS_names.append(name)
-     GPT_names = [pretrained_gpt_name]
-     for name in os.listdir(GPT_weight_root):
-         if name.endswith(".ckpt"): GPT_names.append(name)
-     return SoVITS_names,GPT_names
- SoVITS_weight_root="SoVITS_weights"
- GPT_weight_root="GPT_weights"
- os.makedirs(SoVITS_weight_root,exist_ok=True)
- os.makedirs(GPT_weight_root,exist_ok=True)
- SoVITS_names,GPT_names = get_weights_names()
- 
- def custom_sort_key(s):
-     # Split the string into digit and non-digit parts with a regex
-     parts = re.split('(\d+)', s)
-     # Convert the digit parts to integers; leave the non-digit parts unchanged
-     parts = [int(part) if part.isdigit() else part for part in parts]
-     return parts
- 
- def change_choices():
-     SoVITS_names, GPT_names = get_weights_names()
-     return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
- 
- p_label=None
- p_uvr5=None
- p_asr=None
- p_denoise=None
- p_tts_inference=None
- 
- def kill_proc_tree(pid, including_parent=True):
-     try:
-         parent = psutil.Process(pid)
-     except psutil.NoSuchProcess:
-         # Process already terminated
-         return
- 
-     children = parent.children(recursive=True)
-     for child in children:
-         try:
-             os.kill(child.pid, signal.SIGTERM)  # or signal.SIGKILL
-         except OSError:
-             pass
-     if including_parent:
-         try:
-             os.kill(parent.pid, signal.SIGTERM)  # or signal.SIGKILL
-         except OSError:
-             pass
- 
- system=platform.system()
- def kill_process(pid):
-     if(system=="Windows"):
-         cmd = "taskkill /t /f /pid %s" % pid
-         os.system(cmd)
-     else:
-         kill_proc_tree(pid)
- 
- 
- def change_label(if_label,path_list):
-     global p_label
-     if(if_label==True and p_label==None):
-         path_list=my_utils.clean_path(path_list)
-         cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share)
-         yield i18n("打标工具WebUI已开启")
-         print(cmd)
-         p_label = Popen(cmd, shell=True)
-     elif(if_label==False and p_label!=None):
-         kill_process(p_label.pid)
-         p_label=None
-         yield i18n("打标工具WebUI已关闭")
- 
- def change_uvr5(if_uvr5):
-     global p_uvr5
-     if(if_uvr5==True and p_uvr5==None):
-         cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share)
-         yield i18n("UVR5已开启")
-         print(cmd)
-         p_uvr5 = Popen(cmd, shell=True)
-     elif(if_uvr5==False and p_uvr5!=None):
-         kill_process(p_uvr5.pid)
-         p_uvr5=None
-         yield i18n("UVR5已关闭")
- 
- def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path):
-     global p_tts_inference
-     if(if_tts==True and p_tts_inference==None):
-         os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
-         os.environ["sovits_path"]=sovits_path if "/" in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
-         os.environ["cnhubert_base_path"]=cnhubert_base_path
-         os.environ["bert_path"]=bert_path
-         os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number
-         os.environ["is_half"]=str(is_half)
-         os.environ["infer_ttswebui"]=str(webui_port_infer_tts)
-         os.environ["is_share"]=str(is_share)
-         cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec)
-         yield i18n("TTS推理进程已开启")
-         print(cmd)
-         p_tts_inference = Popen(cmd, shell=True)
-     elif(if_tts==False and p_tts_inference!=None):
-         kill_process(p_tts_inference.pid)
-         p_tts_inference=None
-         yield i18n("TTS推理进程已关闭")
- 
197
- from tools.asr.config import asr_dict
198
- def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
199
- global p_asr
200
- if(p_asr==None):
201
- asr_inp_dir=my_utils.clean_path(asr_inp_dir)
202
- cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}'
203
- cmd += f' -i "{asr_inp_dir}"'
204
- cmd += f' -o "{asr_opt_dir}"'
205
- cmd += f' -s {asr_model_size}'
206
- cmd += f' -l {asr_lang}'
207
- cmd += " -p %s"%("float16"if is_half==True else "float32")
208
-
209
- yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
210
- print(cmd)
211
- p_asr = Popen(cmd, shell=True)
212
- p_asr.wait()
213
- p_asr=None
214
- yield f"ASR任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
215
- else:
216
- yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
217
- # return None
218
-
219
- def close_asr():
220
- global p_asr
221
- if(p_asr!=None):
222
- kill_process(p_asr.pid)
223
- p_asr=None
224
- return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
225
- def open_denoise(denoise_inp_dir, denoise_opt_dir):
226
- global p_denoise
227
- if(p_denoise==None):
228
- denoise_inp_dir=my_utils.clean_path(denoise_inp_dir)
229
- denoise_opt_dir=my_utils.clean_path(denoise_opt_dir)
230
- cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32")
231
-
232
- yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
233
- print(cmd)
234
- p_denoise = Popen(cmd, shell=True)
235
- p_denoise.wait()
236
- p_denoise=None
237
- yield f"语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
238
- else:
239
- yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
240
- # return None
241
-
242
- def close_denoise():
243
- global p_denoise
244
- if(p_denoise!=None):
245
- kill_process(p_denoise.pid)
246
- p_denoise=None
247
- return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
248
-
249
- p_train_SoVITS=None
250
- def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
251
- global p_train_SoVITS
252
- if(p_train_SoVITS==None):
253
- with open("GPT_SoVITS/configs/s2.json")as f:
254
- data=f.read()
255
- data=json.loads(data)
256
- s2_dir="%s/%s"%(exp_root,exp_name)
257
- os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True)
258
- if(is_half==False):
259
- data["train"]["fp16_run"]=False
260
- batch_size=max(1,batch_size//2)
261
- data["train"]["batch_size"]=batch_size
262
- data["train"]["epochs"]=total_epoch
263
- data["train"]["text_low_lr_rate"]=text_low_lr_rate
264
- data["train"]["pretrained_s2G"]=pretrained_s2G
265
- data["train"]["pretrained_s2D"]=pretrained_s2D
266
- data["train"]["if_save_latest"]=if_save_latest
267
- data["train"]["if_save_every_weights"]=if_save_every_weights
268
- data["train"]["save_every_epoch"]=save_every_epoch
269
- data["train"]["gpu_numbers"]=gpu_numbers1Ba
270
- data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir
271
- data["save_weight_dir"]=SoVITS_weight_root
272
- data["name"]=exp_name
273
- tmp_config_path="%s/tmp_s2.json"%tmp
274
- with open(tmp_config_path,"w")as f:f.write(json.dumps(data))
275
-
276
- cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path)
277
- yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
278
- print(cmd)
279
- p_train_SoVITS = Popen(cmd, shell=True)
280
- p_train_SoVITS.wait()
281
- p_train_SoVITS=None
282
- yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
283
- else:
284
- yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
285
-
286
- def close1Ba():
287
- global p_train_SoVITS
288
- if(p_train_SoVITS!=None):
289
- kill_process(p_train_SoVITS.pid)
290
- p_train_SoVITS=None
291
- return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
292
-
293
- p_train_GPT=None
294
- def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1):
295
- global p_train_GPT
296
- if(p_train_GPT==None):
297
- with open("GPT_SoVITS/configs/s1longer.yaml")as f:
298
- data=f.read()
299
- data=yaml.load(data, Loader=yaml.FullLoader)
300
- s1_dir="%s/%s"%(exp_root,exp_name)
301
- os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True)
302
- if(is_half==False):
303
- data["train"]["precision"]="32"
304
- batch_size = max(1, batch_size // 2)
305
- data["train"]["batch_size"]=batch_size
306
- data["train"]["epochs"]=total_epoch
307
- data["pretrained_s1"]=pretrained_s1
308
- data["train"]["save_every_n_epoch"]=save_every_epoch
309
- data["train"]["if_save_every_weights"]=if_save_every_weights
310
- data["train"]["if_save_latest"]=if_save_latest
311
- data["train"]["if_dpo"]=if_dpo
312
- data["train"]["half_weights_save_dir"]=GPT_weight_root
313
- data["train"]["exp_name"]=exp_name
314
- data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir
315
- data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir
316
- data["output_dir"]="%s/logs_s1"%s1_dir
317
-
318
- os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",")
319
- os.environ["hz"]="25hz"
320
- tmp_config_path="%s/tmp_s1.yaml"%tmp
321
- with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False))
322
- # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir)
323
- cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path)
324
- yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
325
- print(cmd)
326
- p_train_GPT = Popen(cmd, shell=True)
327
- p_train_GPT.wait()
328
- p_train_GPT=None
329
- yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
330
- else:
331
- yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
332
-
333
- def close1Bb():
334
- global p_train_GPT
335
- if(p_train_GPT!=None):
336
- kill_process(p_train_GPT.pid)
337
- p_train_GPT=None
338
- return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
339
-
340
- ps_slice=[]
341
- def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts):
342
- global ps_slice
343
- inp = my_utils.clean_path(inp)
344
- opt_root = my_utils.clean_path(opt_root)
345
- if(os.path.exists(inp)==False):
346
- yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
347
- return
348
- if os.path.isfile(inp):n_parts=1
349
- elif os.path.isdir(inp):pass
350
- else:
351
- yield "输入路径存在但既不���文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
352
- return
353
- if (ps_slice == []):
354
- for i_part in range(n_parts):
355
- cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts)
356
- print(cmd)
357
- p = Popen(cmd, shell=True)
358
- ps_slice.append(p)
359
- yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
360
- for p in ps_slice:
361
- p.wait()
362
- ps_slice=[]
363
- yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
364
- else:
365
- yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
366
-
367
- def close_slice():
368
- global ps_slice
369
- if (ps_slice != []):
370
- for p_slice in ps_slice:
371
- try:
372
- kill_process(p_slice.pid)
373
- except:
374
- traceback.print_exc()
375
- ps_slice=[]
376
- return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
377
-
378
- ps1a=[]
- def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir):
-     global ps1a
-     inp_text = my_utils.clean_path(inp_text)
-     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
-     if (ps1a == []):
-         opt_dir="%s/%s"%(exp_root,exp_name)
-         config={
-             "inp_text":inp_text,
-             "inp_wav_dir":inp_wav_dir,
-             "exp_name":exp_name,
-             "opt_dir":opt_dir,
-             "bert_pretrained_dir":bert_pretrained_dir,
-         }
-         gpu_names=gpu_numbers.split("-")
-         all_parts=len(gpu_names)
-         for i_part in range(all_parts):
-             config.update(
-                 {
-                     "i_part": str(i_part),
-                     "all_parts": str(all_parts),
-                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                     "is_half": str(is_half)
-                 }
-             )
-             os.environ.update(config)
-             cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
-             print(cmd)
-             p = Popen(cmd, shell=True)
-             ps1a.append(p)
-         yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-         for p in ps1a:
-             p.wait()
-         opt = []
-         for i_part in range(all_parts):
-             txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
-             with open(txt_path, "r", encoding="utf8") as f:
-                 opt += f.read().strip("\n").split("\n")
-             os.remove(txt_path)
-         path_text = "%s/2-name2text.txt" % opt_dir
-         with open(path_text, "w", encoding="utf8") as f:
-             f.write("\n".join(opt) + "\n")
-         ps1a=[]
-         yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
-     else:
-         yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-
- def close1a():
-     global ps1a
-     if (ps1a != []):
-         for p1a in ps1a:
-             try:
-                 kill_process(p1a.pid)
-             except:
-                 traceback.print_exc()
-         ps1a=[]
-     return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-
- ps1b=[]
- def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir):
-     global ps1b
-     inp_text = my_utils.clean_path(inp_text)
-     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
-     if (ps1b == []):
-         config={
-             "inp_text":inp_text,
-             "inp_wav_dir":inp_wav_dir,
-             "exp_name":exp_name,
-             "opt_dir":"%s/%s"%(exp_root,exp_name),
-             "cnhubert_base_dir":ssl_pretrained_dir,
-             "is_half": str(is_half)
-         }
-         gpu_names=gpu_numbers.split("-")
-         all_parts=len(gpu_names)
-         for i_part in range(all_parts):
-             config.update(
-                 {
-                     "i_part": str(i_part),
-                     "all_parts": str(all_parts),
-                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                 }
-             )
-             os.environ.update(config)
-             cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
-             print(cmd)
-             p = Popen(cmd, shell=True)
-             ps1b.append(p)
-         yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-         for p in ps1b:
-             p.wait()
-         ps1b=[]
-         yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
-     else:
-         yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-
- def close1b():
-     global ps1b
-     if (ps1b != []):
-         for p1b in ps1b:
-             try:
-                 kill_process(p1b.pid)
-             except:
-                 traceback.print_exc()
-         ps1b=[]
-     return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-
- ps1c=[]
- def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path):
-     global ps1c
-     inp_text = my_utils.clean_path(inp_text)
-     if (ps1c == []):
-         opt_dir="%s/%s"%(exp_root,exp_name)
-         config={
-             "inp_text":inp_text,
-             "exp_name":exp_name,
-             "opt_dir":opt_dir,
-             "pretrained_s2G":pretrained_s2G_path,
-             "s2config_path":"GPT_SoVITS/configs/s2.json",
-             "is_half": str(is_half)
-         }
-         gpu_names=gpu_numbers.split("-")
-         all_parts=len(gpu_names)
-         for i_part in range(all_parts):
-             config.update(
-                 {
-                     "i_part": str(i_part),
-                     "all_parts": str(all_parts),
-                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                 }
-             )
-             os.environ.update(config)
-             cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
-             print(cmd)
-             p = Popen(cmd, shell=True)
-             ps1c.append(p)
-         yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-         for p in ps1c:
-             p.wait()
-         opt = ["item_name\tsemantic_audio"]
-         path_semantic = "%s/6-name2semantic.tsv" % opt_dir
-         for i_part in range(all_parts):
-             semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
-             with open(semantic_path, "r", encoding="utf8") as f:
-                 opt += f.read().strip("\n").split("\n")
-             os.remove(semantic_path)
-         with open(path_semantic, "w", encoding="utf8") as f:
-             f.write("\n".join(opt) + "\n")
-         ps1c=[]
-         yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
-     else:
-         yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-
- def close1c():
-     global ps1c
-     if (ps1c != []):
-         for p1c in ps1c:
-             try:
-                 kill_process(p1c.pid)
-             except:
-                 traceback.print_exc()
-         ps1c=[]
-     return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
- #####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G
- ps1abc=[]
- def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path):
-     global ps1abc
-     inp_text = my_utils.clean_path(inp_text)
-     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
-     if (ps1abc == []):
-         opt_dir="%s/%s"%(exp_root,exp_name)
-         try:
-             #############################1a
-             path_text="%s/2-name2text.txt" % opt_dir
-             if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and len(open(path_text,"r",encoding="utf8").read().strip("\n").split("\n"))<2)):
-                 config={
-                     "inp_text":inp_text,
-                     "inp_wav_dir":inp_wav_dir,
-                     "exp_name":exp_name,
-                     "opt_dir":opt_dir,
-                     "bert_pretrained_dir":bert_pretrained_dir,
-                     "is_half": str(is_half)
-                 }
-                 gpu_names=gpu_numbers1a.split("-")
-                 all_parts=len(gpu_names)
-                 for i_part in range(all_parts):
-                     config.update(
-                         {
-                             "i_part": str(i_part),
-                             "all_parts": str(all_parts),
-                             "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                         }
-                     )
-                     os.environ.update(config)
-                     cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
-                     print(cmd)
-                     p = Popen(cmd, shell=True)
-                     ps1abc.append(p)
-                 yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-                 for p in ps1abc:p.wait()
-
-                 opt = []
-                 for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part)
-                     txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
-                     with open(txt_path, "r",encoding="utf8") as f:
-                         opt += f.read().strip("\n").split("\n")
-                     os.remove(txt_path)
-                 with open(path_text, "w",encoding="utf8") as f:
-                     f.write("\n".join(opt) + "\n")
-
-             yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-             ps1abc=[]
-             #############################1b
-             config={
-                 "inp_text":inp_text,
-                 "inp_wav_dir":inp_wav_dir,
-                 "exp_name":exp_name,
-                 "opt_dir":opt_dir,
-                 "cnhubert_base_dir":ssl_pretrained_dir,
-             }
-             gpu_names=gpu_numbers1Ba.split("-")
-             all_parts=len(gpu_names)
-             for i_part in range(all_parts):
-                 config.update(
-                     {
-                         "i_part": str(i_part),
-                         "all_parts": str(all_parts),
-                         "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                     }
-                 )
-                 os.environ.update(config)
-                 cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
-                 print(cmd)
-                 p = Popen(cmd, shell=True)
-                 ps1abc.append(p)
-             yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-             for p in ps1abc:p.wait()
-             yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-             ps1abc=[]
-             #############################1c
-             path_semantic = "%s/6-name2semantic.tsv" % opt_dir
-             if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)):
-                 config={
-                     "inp_text":inp_text,
-                     "exp_name":exp_name,
-                     "opt_dir":opt_dir,
-                     "pretrained_s2G":pretrained_s2G_path,
-                     "s2config_path":"GPT_SoVITS/configs/s2.json",
-                 }
-                 gpu_names=gpu_numbers1c.split("-")
-                 all_parts=len(gpu_names)
-                 for i_part in range(all_parts):
-                     config.update(
-                         {
-                             "i_part": str(i_part),
-                             "all_parts": str(all_parts),
-                             "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                         }
-                     )
-                     os.environ.update(config)
-                     cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
-                     print(cmd)
-                     p = Popen(cmd, shell=True)
-                     ps1abc.append(p)
-                 yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-                 for p in ps1abc:p.wait()
-
-                 opt = ["item_name\tsemantic_audio"]
-                 for i_part in range(all_parts):
-                     semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
-                     with open(semantic_path, "r",encoding="utf8") as f:
-                         opt += f.read().strip("\n").split("\n")
-                     os.remove(semantic_path)
-                 with open(path_semantic, "w",encoding="utf8") as f:
-                     f.write("\n".join(opt) + "\n")
-             yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-             ps1abc = []
-             yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-         except:
-             traceback.print_exc()
-             close1abc()
-             yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-     else:
-         yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-
- def close1abc():
-     global ps1abc
-     if (ps1abc != []):
-         for p1abc in ps1abc:
-             try:
-                 kill_process(p1abc.pid)
-             except:
-                 traceback.print_exc()
-         ps1abc=[]
-     return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-
- with gr.Blocks(title="GPT-SoVITS WebUI") as app:
-     gr.Markdown(
-         value=
-             i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
-     )
-     gr.Markdown(
-         value=
-             i18n("中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e")
-     )
-
-     with gr.Tabs():
-         with gr.TabItem(i18n("0-前置数据集获取工具")):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
-             gr.Markdown(value=i18n("0a-UVR5人声伴奏分离&去混响去延迟工具"))
-             with gr.Row():
-                 if_uvr5 = gr.Checkbox(label=i18n("是否开启UVR5-WebUI"),show_label=True)
-                 uvr5_info = gr.Textbox(label=i18n("UVR5进程输出信息"))
-             gr.Markdown(value=i18n("0b-语音切分工具"))
-             with gr.Row():
-                 with gr.Row():
-                     slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),value="")
-                     slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt")
-                     threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34")
-                     min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000")
-                     min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300")
-                     hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10")
-                     max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500")
-                 with gr.Row():
-                     open_slicer_button=gr.Button(i18n("开启语音切割"), variant="primary",visible=True)
-                     close_slicer_button=gr.Button(i18n("终止语音切割"), variant="primary",visible=False)
-                     _max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True)
-                     alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
-                     n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
-                     slicer_info = gr.Textbox(label=i18n("语音切割进程输出信息"))
-             gr.Markdown(value=i18n("0bb-语音降噪工具"))
-             with gr.Row():
-                 open_denoise_button = gr.Button(i18n("开启语音降噪"), visible=True)
-                 close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False)
-                 denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="")
-                 denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt")
-                 denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息"))
-             gr.Markdown(value=i18n("0c-中文批量离线ASR工具"))
-             with gr.Row():
-                 open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True)
-                 close_asr_button = gr.Button(i18n("终止ASR进程"), variant="primary",visible=False)
-                 with gr.Column():
-                     with gr.Row():
-                         asr_inp_dir = gr.Textbox(
-                             label=i18n("输入文件夹路径"),
-                             value="output/slicer_opt",
-                             interactive=True,
-                         )
-                         asr_opt_dir = gr.Textbox(
-                             label = i18n("输出文件夹路径"),
-                             value = "output/asr_opt",
-                             interactive = True,
-                         )
-                     with gr.Row():
-                         asr_model = gr.Dropdown(
-                             label = i18n("ASR 模型"),
-                             choices = list(asr_dict.keys()),
-                             interactive = True,
-                             value="达摩 ASR (中文)"
-                         )
-                         asr_size = gr.Dropdown(
-                             label = i18n("ASR 模型尺寸"),
-                             choices = ["large"],
-                             interactive = True,
-                             value="large"
-                         )
-                         asr_lang = gr.Dropdown(
-                             label = i18n("ASR 语言设置"),
-                             choices = ["zh"],
-                             interactive = True,
-                             value="zh"
-                         )
-                     with gr.Row():
-                         asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))
-
-             def change_lang_choices(key): #根据选择的模型修改可选的语言
-                 # return gr.Dropdown(choices=asr_dict[key]['lang'])
-                 return {"__type__": "update", "choices": asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]}
-             def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸
-                 # return gr.Dropdown(choices=asr_dict[key]['size'])
-                 return {"__type__": "update", "choices": asr_dict[key]['size']}
-             asr_model.change(change_lang_choices, [asr_model], [asr_lang])
-             asr_model.change(change_size_choices, [asr_model], [asr_size])
-
-             gr.Markdown(value=i18n("0d-语音文本校对标注工具"))
-             with gr.Row():
-                 if_label = gr.Checkbox(label=i18n("是否开启打标WebUI"),show_label=True)
-                 path_list = gr.Textbox(
-                     label=i18n(".list标注文件的路径"),
-                     value="output/asr_opt/slicer_opt.list",
-                     interactive=True,
-                 )
-                 label_info = gr.Textbox(label=i18n("打标工具进程输出信息"))
-             if_label.change(change_label, [if_label,path_list], [label_info])
-             if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info])
-             open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang], [asr_info,open_asr_button,close_asr_button])
-             close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button])
-             open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button])
-             close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button])
-             open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button])
-             close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button])
-
-         with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
-             with gr.Row():
-                 exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
-                 gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
-                 pretrained_s2G = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value="GPT_SoVITS/pretrained_models/s2G488k.pth", interactive=True)
-                 pretrained_s2D = gr.Textbox(label=i18n("预训练的SoVITS-D模型路径"), value="GPT_SoVITS/pretrained_models/s2D488k.pth", interactive=True)
-                 pretrained_s1 = gr.Textbox(label=i18n("预训练的GPT模型路径"), value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", interactive=True)
-             with gr.TabItem(i18n("1A-训练集格式化工具")):
-                 gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹"))
-                 with gr.Row():
-                     inp_text = gr.Textbox(label=i18n("*文本标注文件"),value="output/asr_opt/slicer_opt.list",interactive=True)
-                     inp_wav_dir = gr.Textbox(
-                         label=i18n("*训练集音频文件目录"),
-                         value="output/slicer_opt",
-                         interactive=True,
-                         placeholder=i18n("填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。")
-                     )
-                 gr.Markdown(value=i18n("1Aa-文本内容"))
-                 with gr.Row():
-                     gpu_numbers1a = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
-                     bert_pretrained_dir = gr.Textbox(label=i18n("预训练的中文BERT模型路径"),value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False)
-                     button1a_open = gr.Button(i18n("开启文本获取"), variant="primary",visible=True)
-                     button1a_close = gr.Button(i18n("终止文本获取进程"), variant="primary",visible=False)
-                     info1a=gr.Textbox(label=i18n("文本进程输出信息"))
-                 gr.Markdown(value=i18n("1Ab-SSL自监督特征提取"))
-                 with gr.Row():
-                     gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
-                     cnhubert_base_dir = gr.Textbox(label=i18n("预训练的SSL模型路径"),value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False)
-                     button1b_open = gr.Button(i18n("开启SSL提取"), variant="primary",visible=True)
-                     button1b_close = gr.Button(i18n("终止SSL提取进程"), variant="primary",visible=False)
-                     info1b=gr.Textbox(label=i18n("SSL进程输出信息"))
-                 gr.Markdown(value=i18n("1Ac-语义token提取"))
-                 with gr.Row():
-                     gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
-                     button1c_open = gr.Button(i18n("开启语义token提取"), variant="primary",visible=True)
-                     button1c_close = gr.Button(i18n("终止语义token提取进程"), variant="primary",visible=False)
-                     info1c=gr.Textbox(label=i18n("语义token提取进程输出信息"))
-                 gr.Markdown(value=i18n("1Aabc-训练集格式化一键三连"))
-                 with gr.Row():
-                     button1abc_open = gr.Button(i18n("开启一键三连"), variant="primary",visible=True)
-                     button1abc_close = gr.Button(i18n("终止一键三连"), variant="primary",visible=False)
-                     info1abc=gr.Textbox(label=i18n("一键三连进程输出信息"))
-                 button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close])
-                 button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close])
-                 button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close])
-                 button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close])
-                 button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close])
-                 button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close])
-                 button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close])
-                 button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close])
-             with gr.TabItem(i18n("1B-微调训练")):
-                 gr.Markdown(value=i18n("1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。"))
-                 with gr.Row():
-                     batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
-                     total_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=8,interactive=True)
-                     text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,interactive=True)
-                     save_every_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True)
-                     if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
-                     if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
-                     gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
-                 with gr.Row():
-                     button1Ba_open = gr.Button(i18n("开启SoVITS训练"), variant="primary",visible=True)
-                     button1Ba_close = gr.Button(i18n("终止SoVITS训练"), variant="primary",visible=False)
-                     info1Ba=gr.Textbox(label=i18n("SoVITS训练进程输出信息"))
-                 gr.Markdown(value=i18n("1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。"))
-                 with gr.Row():
-                     batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
-                     total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True)
-                     if_dpo = gr.Checkbox(label=i18n("是否开启dpo训练选项(实验性)"), value=False, interactive=True, show_label=True)
-                     if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
-                     if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
-                     save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True)
-                     gpu_numbers1Bb = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
-                 with gr.Row():
-                     button1Bb_open = gr.Button(i18n("开启GPT训练"), variant="primary",visible=True)
-                     button1Bb_close = gr.Button(i18n("终止GPT训练"), variant="primary",visible=False)
-                     info1Bb=gr.Textbox(label=i18n("GPT训练进程输出信息"))
-                 button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close])
-                 button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close])
-                 button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_dpo,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close])
-                 button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close])
-             with gr.TabItem(i18n("1C-推理")):
-                 gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。"))
-                 with gr.Row():
-                     GPT_dropdown = gr.Dropdown(label=i18n("*GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name,interactive=True)
-                     SoVITS_dropdown = gr.Dropdown(label=i18n("*SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name,interactive=True)
-                     gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True)
-                     refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
-                     refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown])
-                 with gr.Row():
-                     if_tts = gr.Checkbox(label=i18n("是否开启TTS推理WebUI"), show_label=True)
-                     tts_info = gr.Textbox(label=i18n("TTS推理WebUI进程输出信息"))
-                 if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info])
-         with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("施工中,请静候佳音"))
-     app.queue(concurrency_count=511, max_size=1022).launch(
-         server_name="0.0.0.0",
-         inbrowser=True,
-         share=True,
-         server_port=webui_port_main,
-         quiet=True,
-     )

+ import os,shutil,sys,pdb,re
+ now_dir = os.getcwd()
+ sys.path.insert(0, now_dir)
+ import json,yaml,warnings,torch
+ import platform
+ import psutil
+ import signal
+
+ warnings.filterwarnings("ignore")
+ torch.manual_seed(233333)
+ tmp = os.path.join(now_dir, "TEMP")
+ os.makedirs(tmp, exist_ok=True)
+ os.environ["TEMP"] = tmp
+ if(os.path.exists(tmp)):
+     for name in os.listdir(tmp):
+         if(name=="jieba.cache"):continue
+         path="%s/%s"%(tmp,name)
+         delete=os.remove if os.path.isfile(path) else shutil.rmtree
+         try:
+             delete(path)
+         except Exception as e:
+             print(str(e))
+             pass
+ import site
+ site_packages_roots = []
+ for path in site.getsitepackages():
+     if "packages" in path:
+         site_packages_roots.append(path)
+ if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir]
+ #os.environ["OPENBLAS_NUM_THREADS"] = "4"
+ os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
+ os.environ["all_proxy"] = ""
+ for site_packages_root in site_packages_roots:
+     if os.path.exists(site_packages_root):
+         try:
+             with open("%s/users.pth" % (site_packages_root), "w") as f:
+                 f.write(
+                     "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
+                     % (now_dir, now_dir, now_dir, now_dir, now_dir)
+                 )
+             break
+         except PermissionError:
+             pass
+ from tools import my_utils
+ import traceback
+ import shutil
+ import pdb
+ import gradio as gr
+ from subprocess import Popen
+ import signal
+ from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share
+ from tools.i18n.i18n import I18nAuto
+ i18n = I18nAuto()
+ from scipy.io import wavfile
+ from tools.my_utils import load_audio
+ from multiprocessing import cpu_count
+
+ # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
+
+ n_cpu=cpu_count()
+
+ ngpu = torch.cuda.device_count()
+ gpu_infos = []
+ mem = []
+ if_gpu_ok = False
+
+ # 判断是否有能用来训练和加速推理的N卡
+ if torch.cuda.is_available() or ngpu != 0:
+     for i in range(ngpu):
+         gpu_name = torch.cuda.get_device_name(i)
+         if any(value in gpu_name.upper()for value in ["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060"]):
+             # A10#A100#V100#A40#P40#M40#K80#A4500
+             if_gpu_ok = True  # 至少有一张能用的N卡
+             gpu_infos.append("%s\t%s" % (i, gpu_name))
+             mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
+ # # 判断是否支持mps加速
+ # if torch.backends.mps.is_available():
+ #     if_gpu_ok = True
+ #     gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
+ #     mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存
+
+ if if_gpu_ok and len(gpu_infos) > 0:
+     gpu_info = "\n".join(gpu_infos)
+     default_batch_size = min(mem) // 2
+ else:
+     gpu_info = ("%s\t%s" % ("0", "CPU"))
+     gpu_infos.append("%s\t%s" % ("0", "CPU"))
+     default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2
+ gpus = "-".join([i[0] for i in gpu_infos])
+
+ pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
+ pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+ def get_weights_names():
+     SoVITS_names = [pretrained_sovits_name]
+     for name in os.listdir(SoVITS_weight_root):
+         if name.endswith(".pth"):SoVITS_names.append(name)
+     GPT_names = [pretrained_gpt_name]
+     for name in os.listdir(GPT_weight_root):
+         if name.endswith(".ckpt"): GPT_names.append(name)
+     return SoVITS_names,GPT_names
+ SoVITS_weight_root="SoVITS_weights"
+ GPT_weight_root="GPT_weights"
+ os.makedirs(SoVITS_weight_root,exist_ok=True)
+ os.makedirs(GPT_weight_root,exist_ok=True)
+ SoVITS_names,GPT_names = get_weights_names()
+
+ def custom_sort_key(s):
+     # 使用正则表达式提取字符串中的数字部分和非数字部分
+     parts = re.split('(\d+)', s)
+     # 将数字部分转换为整数,非数字部分保持不变
+     parts = [int(part) if part.isdigit() else part for part in parts]
+     return parts
+
+ def change_choices():
+     SoVITS_names, GPT_names = get_weights_names()
+     return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
+
+ p_label=None
+ p_uvr5=None
+ p_asr=None
+ p_denoise=None
+ p_tts_inference=None
+
+ def kill_proc_tree(pid, including_parent=True):
+     try:
+         parent = psutil.Process(pid)
+     except psutil.NoSuchProcess:
+         # Process already terminated
+         return
+
+     children = parent.children(recursive=True)
+     for child in children:
+         try:
+             os.kill(child.pid, signal.SIGTERM)  # or signal.SIGKILL
+         except OSError:
+             pass
+     if including_parent:
+         try:
+             os.kill(parent.pid, signal.SIGTERM)  # or signal.SIGKILL
+         except OSError:
+             pass
+
+ system=platform.system()
+ def kill_process(pid):
+     if(system=="Windows"):
+         cmd = "taskkill /t /f /pid %s" % pid
+         os.system(cmd)
+     else:
+         kill_proc_tree(pid)
+
+
+ def change_label(if_label,path_list):
+     global p_label
+     if(if_label==True and p_label==None):
+         path_list=my_utils.clean_path(path_list)
+         cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share)
+         yield i18n("打标工具WebUI已开启")
+         print(cmd)
+         p_label = Popen(cmd, shell=True)
+     elif(if_label==False and p_label!=None):
+         kill_process(p_label.pid)
+         p_label=None
+         yield i18n("打标工具WebUI已关闭")
+
+ def change_uvr5(if_uvr5):
+     global p_uvr5
+     if(if_uvr5==True and p_uvr5==None):
+         cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share)
+         yield i18n("UVR5已开启")
+         print(cmd)
+         p_uvr5 = Popen(cmd, shell=True)
+     elif(if_uvr5==False and p_uvr5!=None):
+         kill_process(p_uvr5.pid)
+         p_uvr5=None
+         yield i18n("UVR5已关闭")
+
+ def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path):
+     global p_tts_inference
+     if(if_tts==True and p_tts_inference==None):
+         os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
+         os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
+         os.environ["cnhubert_base_path"]=cnhubert_base_path
+         os.environ["bert_path"]=bert_path
+         os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number
+         os.environ["is_half"]=str(is_half)
+         os.environ["infer_ttswebui"]=str(webui_port_infer_tts)
+         os.environ["is_share"]=str(is_share)
+         cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec)
+         yield i18n("TTS推理进程已开启")
+         print(cmd)
+         p_tts_inference = Popen(cmd, shell=True)
+     elif(if_tts==False and p_tts_inference!=None):
+         kill_process(p_tts_inference.pid)
+         p_tts_inference=None
+         yield i18n("TTS推理进程已关闭")
+
+ from tools.asr.config import asr_dict
+ def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
+     global p_asr
+     if(p_asr==None):
+         asr_inp_dir=my_utils.clean_path(asr_inp_dir)
+         cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}'
+         cmd += f' -i "{asr_inp_dir}"'
+         cmd += f' -o "{asr_opt_dir}"'
+         cmd += f' -s {asr_model_size}'
+         cmd += f' -l {asr_lang}'
+         cmd += " -p %s"%("float16"if is_half==True else "float32")
+
+         yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         print(cmd)
+         p_asr = Popen(cmd, shell=True)
+         p_asr.wait()
+         p_asr=None
+         yield f"ASR任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         # return None
+
+ def close_asr():
+     global p_asr
+     if(p_asr!=None):
+         kill_process(p_asr.pid)
+         p_asr=None
+     return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+ def open_denoise(denoise_inp_dir, denoise_opt_dir):
+     global p_denoise
+     if(p_denoise==None):
+         denoise_inp_dir=my_utils.clean_path(denoise_inp_dir)
+         denoise_opt_dir=my_utils.clean_path(denoise_opt_dir)
+         cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32")
+
+         yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         print(cmd)
+         p_denoise = Popen(cmd, shell=True)
+         p_denoise.wait()
+         p_denoise=None
+         yield f"语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         # return None
+
+ def close_denoise():
+     global p_denoise
+     if(p_denoise!=None):
+         kill_process(p_denoise.pid)
+         p_denoise=None
+     return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+
+ p_train_SoVITS=None
+ def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
+     global p_train_SoVITS
+     if(p_train_SoVITS==None):
+         with open("GPT_SoVITS/configs/s2.json")as f:
+             data=f.read()
+             data=json.loads(data)
+         s2_dir="%s/%s"%(exp_root,exp_name)
+         os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True)
+         if(is_half==False):
+             data["train"]["fp16_run"]=False
+             batch_size=max(1,batch_size//2)
+         data["train"]["batch_size"]=batch_size
+         data["train"]["epochs"]=total_epoch
+         data["train"]["text_low_lr_rate"]=text_low_lr_rate
+         data["train"]["pretrained_s2G"]=pretrained_s2G
+         data["train"]["pretrained_s2D"]=pretrained_s2D
+         data["train"]["if_save_latest"]=if_save_latest
+         data["train"]["if_save_every_weights"]=if_save_every_weights
+         data["train"]["save_every_epoch"]=save_every_epoch
+         data["train"]["gpu_numbers"]=gpu_numbers1Ba
+         data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir
+         data["save_weight_dir"]=SoVITS_weight_root
+         data["name"]=exp_name
+         tmp_config_path="%s/tmp_s2.json"%tmp
+         with open(tmp_config_path,"w")as f:f.write(json.dumps(data))
+
+         cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path)
+         yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         print(cmd)
+         p_train_SoVITS = Popen(cmd, shell=True)
+         p_train_SoVITS.wait()
+         p_train_SoVITS=None
+         yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+
+ def close1Ba():
+     global p_train_SoVITS
+     if(p_train_SoVITS!=None):
+         kill_process(p_train_SoVITS.pid)
+         p_train_SoVITS=None
+     return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+
+ p_train_GPT=None
+ def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1):
+     global p_train_GPT
+     if(p_train_GPT==None):
+         with open("GPT_SoVITS/configs/s1longer.yaml")as f:
+             data=f.read()
+             data=yaml.load(data, Loader=yaml.FullLoader)
+         s1_dir="%s/%s"%(exp_root,exp_name)
+         os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True)
+         if(is_half==False):
+             data["train"]["precision"]="32"
+             batch_size = max(1, batch_size // 2)
+         data["train"]["batch_size"]=batch_size
+         data["train"]["epochs"]=total_epoch
+         data["pretrained_s1"]=pretrained_s1
+         data["train"]["save_every_n_epoch"]=save_every_epoch
+         data["train"]["if_save_every_weights"]=if_save_every_weights
+         data["train"]["if_save_latest"]=if_save_latest
+         data["train"]["if_dpo"]=if_dpo
+         data["train"]["half_weights_save_dir"]=GPT_weight_root
+         data["train"]["exp_name"]=exp_name
+         data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir
+         data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir
+         data["output_dir"]="%s/logs_s1"%s1_dir
+
+         os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",")
+         os.environ["hz"]="25hz"
+         tmp_config_path="%s/tmp_s1.yaml"%tmp
+         with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False))
+         # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir)
+         cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path)
+         yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         print(cmd)
+         p_train_GPT = Popen(cmd, shell=True)
+         p_train_GPT.wait()
+         p_train_GPT=None
+         yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+
+ def close1Bb():
+     global p_train_GPT
+     if(p_train_GPT!=None):
+         kill_process(p_train_GPT.pid)
+         p_train_GPT=None
+     return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+
+ ps_slice=[]
+ def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts):
+     global ps_slice
+     inp = my_utils.clean_path(inp)
+     opt_root = my_utils.clean_path(opt_root)
+     if(os.path.exists(inp)==False):
+         yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+         return
+     if os.path.isfile(inp):n_parts=1
+     elif os.path.isdir(inp):pass
+     else:
+         yield "输入路径存在但既不是文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+         return
+     if (ps_slice == []):
+         for i_part in range(n_parts):
+             cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts)
+             print(cmd)
+             p = Popen(cmd, shell=True)
+             ps_slice.append(p)
+         yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+         for p in ps_slice:
+             p.wait()
+         ps_slice=[]
+         yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close_slice():
+     global ps_slice
+     if (ps_slice != []):
+         for p_slice in ps_slice:
+             try:
+                 kill_process(p_slice.pid)
+             except:
+                 traceback.print_exc()
+         ps_slice=[]
+     return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+
+ ps1a=[]
+ def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir):
+     global ps1a
+     inp_text = my_utils.clean_path(inp_text)
+     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
+     if (ps1a == []):
+         opt_dir="%s/%s"%(exp_root,exp_name)
+         config={
+             "inp_text":inp_text,
+             "inp_wav_dir":inp_wav_dir,
+             "exp_name":exp_name,
+             "opt_dir":opt_dir,
+             "bert_pretrained_dir":bert_pretrained_dir,
+         }
+         gpu_names=gpu_numbers.split("-")
+         all_parts=len(gpu_names)
+         for i_part in range(all_parts):
+             config.update(
+                 {
+                     "i_part": str(i_part),
+                     "all_parts": str(all_parts),
+                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                     "is_half": str(is_half)
+                 }
+             )
+             os.environ.update(config)
+             cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
+             print(cmd)
+             p = Popen(cmd, shell=True)
+             ps1a.append(p)
+         yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+         for p in ps1a:
+             p.wait()
+         opt = []
+         for i_part in range(all_parts):
+             txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
+             with open(txt_path, "r", encoding="utf8") as f:
+                 opt += f.read().strip("\n").split("\n")
+             os.remove(txt_path)
+         path_text = "%s/2-name2text.txt" % opt_dir
+         with open(path_text, "w", encoding="utf8") as f:
+             f.write("\n".join(opt) + "\n")
+         ps1a=[]
+         yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close1a():
+     global ps1a
+     if (ps1a != []):
+         for p1a in ps1a:
+             try:
+                 kill_process(p1a.pid)
+             except:
+                 traceback.print_exc()
+         ps1a=[]
+     return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+
+ ps1b=[]
+ def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir):
+     global ps1b
+     inp_text = my_utils.clean_path(inp_text)
+     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
+     if (ps1b == []):
+         config={
+             "inp_text":inp_text,
+             "inp_wav_dir":inp_wav_dir,
+             "exp_name":exp_name,
+             "opt_dir":"%s/%s"%(exp_root,exp_name),
+             "cnhubert_base_dir":ssl_pretrained_dir,
+             "is_half": str(is_half)
+         }
+         gpu_names=gpu_numbers.split("-")
+         all_parts=len(gpu_names)
+         for i_part in range(all_parts):
+             config.update(
+                 {
+                     "i_part": str(i_part),
+                     "all_parts": str(all_parts),
+                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                 }
+             )
+             os.environ.update(config)
+             cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
+             print(cmd)
+             p = Popen(cmd, shell=True)
+             ps1b.append(p)
+         yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+         for p in ps1b:
+             p.wait()
+         ps1b=[]
+         yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close1b():
+     global ps1b
+     if (ps1b != []):
+         for p1b in ps1b:
+             try:
+                 kill_process(p1b.pid)
+             except:
+                 traceback.print_exc()
+         ps1b=[]
+     return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+
+ ps1c=[]
+ def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path):
+     global ps1c
+     inp_text = my_utils.clean_path(inp_text)
+     if (ps1c == []):
+         opt_dir="%s/%s"%(exp_root,exp_name)
+         config={
+             "inp_text":inp_text,
+             "exp_name":exp_name,
+             "opt_dir":opt_dir,
+             "pretrained_s2G":pretrained_s2G_path,
+             "s2config_path":"GPT_SoVITS/configs/s2.json",
+             "is_half": str(is_half)
+         }
+         gpu_names=gpu_numbers.split("-")
+         all_parts=len(gpu_names)
+         for i_part in range(all_parts):
+             config.update(
+                 {
+                     "i_part": str(i_part),
+                     "all_parts": str(all_parts),
+                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                 }
+             )
+             os.environ.update(config)
+             cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
+             print(cmd)
+             p = Popen(cmd, shell=True)
+             ps1c.append(p)
+         yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+         for p in ps1c:
+             p.wait()
+         opt = ["item_name\tsemantic_audio"]
+         path_semantic = "%s/6-name2semantic.tsv" % opt_dir
+         for i_part in range(all_parts):
+             semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
+             with open(semantic_path, "r", encoding="utf8") as f:
+                 opt += f.read().strip("\n").split("\n")
+             os.remove(semantic_path)
+         with open(path_semantic, "w", encoding="utf8") as f:
+             f.write("\n".join(opt) + "\n")
+         ps1c=[]
+         yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close1c():
+     global ps1c
+     if (ps1c != []):
+         for p1c in ps1c:
+             try:
+                 kill_process(p1c.pid)
+             except:
+                 traceback.print_exc()
+         ps1c=[]
+     return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+ #####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G
+ ps1abc=[]
+ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path):
+     global ps1abc
+     inp_text = my_utils.clean_path(inp_text)
+     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
+     if (ps1abc == []):
+         opt_dir="%s/%s"%(exp_root,exp_name)
+         try:
+             #############################1a
+             path_text="%s/2-name2text.txt" % opt_dir
+             if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and len(open(path_text,"r",encoding="utf8").read().strip("\n").split("\n"))<2)):
+                 config={
+                     "inp_text":inp_text,
+                     "inp_wav_dir":inp_wav_dir,
+                     "exp_name":exp_name,
+                     "opt_dir":opt_dir,
+                     "bert_pretrained_dir":bert_pretrained_dir,
+                     "is_half": str(is_half)
+                 }
+                 gpu_names=gpu_numbers1a.split("-")
+                 all_parts=len(gpu_names)
+                 for i_part in range(all_parts):
+                     config.update(
+                         {
+                             "i_part": str(i_part),
+                             "all_parts": str(all_parts),
+                             "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                         }
+                     )
+                     os.environ.update(config)
+                     cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
+                     print(cmd)
+                     p = Popen(cmd, shell=True)
+                     ps1abc.append(p)
+                 yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+                 for p in ps1abc:p.wait()
+
+                 opt = []
+                 for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part)
+                     txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
+                     with open(txt_path, "r",encoding="utf8") as f:
+                         opt += f.read().strip("\n").split("\n")
+                     os.remove(txt_path)
+                 with open(path_text, "w",encoding="utf8") as f:
+                     f.write("\n".join(opt) + "\n")
+
+             yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+             ps1abc=[]
+             #############################1b
+             config={
+                 "inp_text":inp_text,
+                 "inp_wav_dir":inp_wav_dir,
+                 "exp_name":exp_name,
+                 "opt_dir":opt_dir,
+                 "cnhubert_base_dir":ssl_pretrained_dir,
+             }
+             gpu_names=gpu_numbers1Ba.split("-")
+             all_parts=len(gpu_names)
+             for i_part in range(all_parts):
+                 config.update(
+                     {
+                         "i_part": str(i_part),
+                         "all_parts": str(all_parts),
+                         "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                     }
+                 )
+                 os.environ.update(config)
+                 cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
+                 print(cmd)
+                 p = Popen(cmd, shell=True)
+                 ps1abc.append(p)
+             yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+             for p in ps1abc:p.wait()
+             yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+             ps1abc=[]
+             #############################1c
+             path_semantic = "%s/6-name2semantic.tsv" % opt_dir
+             if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)):
+                 config={
+                     "inp_text":inp_text,
+                     "exp_name":exp_name,
+                     "opt_dir":opt_dir,
+                     "pretrained_s2G":pretrained_s2G_path,
+                     "s2config_path":"GPT_SoVITS/configs/s2.json",
+                 }
+                 gpu_names=gpu_numbers1c.split("-")
+                 all_parts=len(gpu_names)
+                 for i_part in range(all_parts):
+                     config.update(
+                         {
+                             "i_part": str(i_part),
+                             "all_parts": str(all_parts),
+                             "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                         }
+                     )
+                     os.environ.update(config)
+                     cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
+                     print(cmd)
+                     p = Popen(cmd, shell=True)
+                     ps1abc.append(p)
+                 yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+                 for p in ps1abc:p.wait()
+
+                 opt = ["item_name\tsemantic_audio"]
+                 for i_part in range(all_parts):
+                     semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
+                     with open(semantic_path, "r",encoding="utf8") as f:
+                         opt += f.read().strip("\n").split("\n")
+                     os.remove(semantic_path)
+                 with open(path_semantic, "w",encoding="utf8") as f:
+                     f.write("\n".join(opt) + "\n")
+             yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+             ps1abc = []
+             yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+         except:
+             traceback.print_exc()
+             close1abc()
+             yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+     else:
+         yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close1abc():
+     global ps1abc
+     if (ps1abc != []):
+         for p1abc in ps1abc:
+             try:
+                 kill_process(p1abc.pid)
+             except:
+                 traceback.print_exc()
+         ps1abc=[]
+     return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+
+ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
674
+ gr.Markdown(
675
+ value=
676
+ i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
677
+ )
678
+ gr.Markdown(
679
+ value=
680
+ i18n("中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e")
681
+ )
682
+
683
+ with gr.Tabs():
684
+ with gr.TabItem(i18n("0-前置数据集获取工具")):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
685
+ gr.Markdown(value=i18n("0a-UVR5人声伴奏分离&去混响去延迟工具"))
686
+ with gr.Row():
687
+ if_uvr5 = gr.Checkbox(label=i18n("是否开启UVR5-WebUI"),show_label=True)
688
+ uvr5_info = gr.Textbox(label=i18n("UVR5进程输出信息"))
689
+ gr.Markdown(value=i18n("0b-语音切分工具"))
690
+ with gr.Row():
691
+ with gr.Row():
692
+ slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),value="")
693
+ slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt")
694
+ threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34")
695
+ min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000")
696
+ min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300")
697
+ hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10")
698
+ max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500")
699
+ with gr.Row():
700
+ open_slicer_button=gr.Button(i18n("开启语音切割"), variant="primary",visible=True)
701
+ close_slicer_button=gr.Button(i18n("终止语音切割"), variant="primary",visible=False)
702
+ _max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True)
703
+ alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
704
+ n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
705
+ slicer_info = gr.Textbox(label=i18n("语音切割进���输出信息"))
706
+ gr.Markdown(value=i18n("0bb-语音降噪工具"))
707
+ with gr.Row():
708
+ open_denoise_button = gr.Button(i18n("开启语音降噪"), variant="primary",visible=True)
709
+ close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False)
710
+ denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="")
711
+ denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt")
712
+ denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息"))
713
+ gr.Markdown(value=i18n("0c-中文批量离线ASR工具"))
714
+ with gr.Row():
715
+ open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True)
716
+ close_asr_button = gr.Button(i18n("终止ASR进程"), variant="primary",visible=False)
717
+ with gr.Column():
718
+ with gr.Row():
719
+ asr_inp_dir = gr.Textbox(
720
+ label=i18n("输入文件夹路径"),
721
+ value="D:\\GPT-SoVITS\\raw\\xxx",
722
+ interactive=True,
723
+ )
724
+ asr_opt_dir = gr.Textbox(
725
+ label = i18n("输出文件夹路径"),
726
+ value = "output/asr_opt",
727
+ interactive = True,
728
+ )
729
+ with gr.Row():
730
+ asr_model = gr.Dropdown(
731
+ label = i18n("ASR 模型"),
732
+ choices = list(asr_dict.keys()),
733
+ interactive = True,
734
+ value="达摩 ASR (中文)"
735
+ )
736
+ asr_size = gr.Dropdown(
737
+ label = i18n("ASR 模型尺寸"),
738
+ choices = ["large"],
739
+ interactive = True,
740
+ value="large"
741
+ )
742
+ asr_lang = gr.Dropdown(
743
+ label = i18n("ASR 语言设置"),
744
+ choices = ["zh"],
745
+ interactive = True,
746
+ value="zh"
747
+ )
748
+ with gr.Row():
749
+ asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))
750
+
751
+ def change_lang_choices(key): #根据选择的模型修改可选的语言
752
+ # return gr.Dropdown(choices=asr_dict[key]['lang'])
753
+ return {"__type__": "update", "choices": asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]}
754
+ def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸
755
+ # return gr.Dropdown(choices=asr_dict[key]['size'])
756
+ return {"__type__": "update", "choices": asr_dict[key]['size']}
757
+ asr_model.change(change_lang_choices, [asr_model], [asr_lang])
758
+ asr_model.change(change_size_choices, [asr_model], [asr_size])
+
+ gr.Markdown(value=i18n("0d-语音文本校对标注工具"))
+ with gr.Row():
+ if_label = gr.Checkbox(label=i18n("是否开启打标WebUI"),show_label=True)
+ path_list = gr.Textbox(
+ label=i18n(".list标注文件的路径"),
+ value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list",
+ interactive=True,
+ )
+ label_info = gr.Textbox(label=i18n("打标工具进程输出信息"))
+ if_label.change(change_label, [if_label,path_list], [label_info])
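+ # Toggling the checkbox presumably starts (or kills) the proofreading/labeling
+ # WebUI as a subprocess on the given .list file, so transcripts can be
+ # corrected by hand before training.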
+ if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info])
+ open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang], [asr_info,open_asr_button,close_asr_button])
+ close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button])
+ open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button])
+ close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button])
+ open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button])
+ close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button])
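+ # Shared pattern for every tool on this tab: each open_* handler launches the
+ # task as a subprocess and flips the open/close buttons' visibility, while the
+ # matching close_* handler terminates it, streaming status into the info box.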
777
+
778
+ with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
779
+ with gr.Row():
780
+ exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
781
+ gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
782
+ pretrained_s2G = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value="GPT_SoVITS/pretrained_models/s2G488k.pth", interactive=True)
783
+ pretrained_s2D = gr.Textbox(label=i18n("预训练的SoVITS-D模型路径"), value="GPT_SoVITS/pretrained_models/s2D488k.pth", interactive=True)
784
+ pretrained_s1 = gr.Textbox(label=i18n("预训练的GPT模型路径"), value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", interactive=True)
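+ # Fine-tuning starts from these base checkpoints: s2G/s2D are the SoVITS
+ # generator and discriminator, and s1 is the autoregressive GPT (text-to-
+ # semantic) model. Note the gpu_info textbox rebinds the gpu_info string
+ # variable it displays, shadowing it from here on.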
785
+ with gr.TabItem(i18n("1A-训练集格式化工具")):
786
+ gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹"))
787
+ with gr.Row():
788
+ inp_text = gr.Textbox(label=i18n("*文本标注文件"),value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",interactive=True)
789
+ inp_wav_dir = gr.Textbox(
790
+ label=i18n("*训练集音频文件目录"),
791
+ # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",
792
+ interactive=True,
793
+ placeholder=i18n("填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。")
794
+ )
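+ # The .list file presumably follows the usual GPT-SoVITS annotation format,
+ # one utterance per line (example values here are hypothetical):
+ #     vocal_001.wav|speaker_name|zh|今天天气真好。
+ # When inp_wav_dir is set, each audio path is resolved as inp_wav_dir joined
+ # with the file name from the .list line; when left empty, the .list entry is
+ # treated as an absolute path.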
+ gr.Markdown(value=i18n("1Aa-文本内容"))
+ with gr.Row():
+ gpu_numbers1a = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
+ bert_pretrained_dir = gr.Textbox(label=i18n("预训练的中文BERT模型路径"),value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False)
+ button1a_open = gr.Button(i18n("开启文本获取"), variant="primary",visible=True)
+ button1a_close = gr.Button(i18n("终止文本获取进程"), variant="primary",visible=False)
+ info1a=gr.Textbox(label=i18n("文本进程输出信息"))
+ gr.Markdown(value=i18n("1Ab-SSL自监督特征提取"))
+ with gr.Row():
+ gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
+ cnhubert_base_dir = gr.Textbox(label=i18n("预训练的SSL模型路径"),value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False)
+ button1b_open = gr.Button(i18n("开启SSL提取"), variant="primary",visible=True)
+ button1b_close = gr.Button(i18n("终止SSL提取进程"), variant="primary",visible=False)
+ info1b=gr.Textbox(label=i18n("SSL进程输出信息"))
+ gr.Markdown(value=i18n("1Ac-语义token提取"))
+ with gr.Row():
+ gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
+ button1c_open = gr.Button(i18n("开启语义token提取"), variant="primary",visible=True)
+ button1c_close = gr.Button(i18n("终止语义token提取进程"), variant="primary",visible=False)
+ info1c=gr.Textbox(label=i18n("语义token提取进程输出信息"))
+ gr.Markdown(value=i18n("1Aabc-训练集格式化一键三连"))
+ with gr.Row():
+ button1abc_open = gr.Button(i18n("开启一键三连"), variant="primary",visible=True)
+ button1abc_close = gr.Button(i18n("终止一键三连"), variant="primary",visible=False)
+ info1abc=gr.Textbox(label=i18n("一键三连进程输出信息"))
+ button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close])
+ button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close])
+ button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close])
+ button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close])
+ button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close])
+ button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close])
+ button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close])
+ button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close])
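+ # Dataset formatting is three passes: 1Aa extracts BERT text features from the
+ # transcripts, 1Ab extracts cnhubert SSL features from the audio, and 1Ac
+ # quantizes those features into semantic tokens using the pretrained SoVITS-G.
+ # "一键三连" (one-click) simply runs all three in sequence.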
+ with gr.TabItem(i18n("1B-微调训练")):
+ gr.Markdown(value=i18n("1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。"))
+ with gr.Row():
+ batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
+ total_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=8,interactive=True)
+ text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,interactive=True)
+ save_every_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True)
+ if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
+ if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
+ gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
+ with gr.Row():
+ button1Ba_open = gr.Button(i18n("开启SoVITS训练"), variant="primary",visible=True)
+ button1Ba_close = gr.Button(i18n("终止SoVITS训练"), variant="primary",visible=False)
+ info1Ba=gr.Textbox(label=i18n("SoVITS训练进程输出信息"))
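+ # text_low_lr_rate scales the learning rate applied to the text-encoder
+ # parameters relative to the rest of the SoVITS model, which presumably keeps
+ # the text side from drifting too far on small fine-tuning sets.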
+ gr.Markdown(value=i18n("1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。"))
+ with gr.Row():
+ batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
+ total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True)
+ if_dpo = gr.Checkbox(label=i18n("是否开启dpo训练选项(实验性)"), value=False, interactive=True, show_label=True)
+ if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
+ if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
+ save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True)
+ gpu_numbers1Bb = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
+ with gr.Row():
+ button1Bb_open = gr.Button(i18n("开启GPT训练"), variant="primary",visible=True)
+ button1Bb_close = gr.Button(i18n("终止GPT训练"), variant="primary",visible=False)
+ info1Bb=gr.Textbox(label=i18n("GPT训练进程输出信息"))
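+ # if_dpo enables the experimental DPO (Direct Preference Optimization) path
+ # for GPT training; it is off by default and, as the label notes, experimental.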
+ button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close])
+ button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close])
+ button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_dpo,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close])
+ button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close])
+ with gr.TabItem(i18n("1C-推理")):
+ gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。"))
+ with gr.Row():
+ GPT_dropdown = gr.Dropdown(label=i18n("*GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name,interactive=True)
+ SoVITS_dropdown = gr.Dropdown(label=i18n("*SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name,interactive=True)
+ gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True)
+ refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+ refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown])
+ with gr.Row():
+ if_tts = gr.Checkbox(label=i18n("是否开启TTS推理WebUI"), show_label=True)
+ tts_info = gr.Textbox(label=i18n("TTS推理WebUI进程输出信息"))
+ if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info])
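+ # Checking the box spawns the standalone TTS inference WebUI with the selected
+ # GPT/SoVITS checkpoints; the paths are presumably handed to the subprocess
+ # through environment variables, so unchecking and re-checking picks up a new
+ # model selection.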
+ with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("施工中,请静候佳音"))
+ app.queue(concurrency_count=511, max_size=1022).launch(
+ server_name="0.0.0.0",
+ inbrowser=True,
+ share=is_share,
+ server_port=webui_port_main,
+ quiet=True,
+ )
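+ # Gradio 3.x queue settings: concurrency_count caps simultaneously running
+ # events and max_size caps the number of queued requests; binding to 0.0.0.0
+ # exposes the UI on all interfaces at webui_port_main. (concurrency_count was
+ # removed in Gradio 4, so this call is version-sensitive.)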