Commit c8d8351
Parent(s): 9b54243

GPTSV_FI Update Part 1

Files changed:
- .gitignore +5 -0
- api.py +734 -559
- api_v2.py +453 -0
- config.py +66 -66
- go-webui.bat +2 -2
- requirements.txt +2 -12
- tools/asr/fasterwhisper_asr.py +23 -19
- tools/asr/funasr_asr.py +4 -3
- tools/cmd-denoise.py +28 -28
- tools/i18n/i18n.py +1 -1
- tools/my_utils.py +31 -31
- tools/slice_audio.py +48 -48
- tools/slicer2.py +261 -261
- tools/subfix_webui.py +2 -2
- tools/uvr5/lib/lib_v5/modelparams/4band_v3.json +53 -53
- tools/uvr5/webui.py +1 -2
- webui.py +878 -878
.gitignore
CHANGED
@@ -10,3 +10,8 @@ reference
 GPT_weights
 SoVITS_weights
 TEMP
+PortableGit
+ffmpeg.exe
+ffprobe.exe
+tmp_audio
+trained
api.py
CHANGED
@@ -1,559 +1,734 @@
+"""
+# api.py usage
+
+` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" `
+
+## Command-line arguments:
+
+`-s` - `SoVITS model path, can also be set in config.py`
+`-g` - `GPT model path, can also be set in config.py`
+
+Used when a request omits the reference audio:
+`-dr` - `default reference audio path`
+`-dt` - `default reference audio text`
+`-dl` - `default reference audio language, "中文","英文","日文","zh","en","ja"`
+
+`-d` - `inference device, "cuda","cpu"`
+`-a` - `bind address, default "127.0.0.1"`
+`-p` - `bind port, default 9880, can also be set in config.py`
+`-fp` - `override config.py and use full precision`
+`-hp` - `override config.py and use half precision`
+`-sm` - `streaming mode, disabled by default, "close","c", "normal","n", "keepalive","k"`
+`-mt` - `returned audio encoding, ogg by default when streaming, wav otherwise, "wav", "ogg", "aac"`
+`-cp` - `punctuation to split the text on, empty by default, passed as a string such as ",.,。"`
+
+`-hb` - `cnhubert path`
+`-b` - `bert path`
+
+## Endpoints:
+
+### Inference
+
+endpoint: `/`
+
+Using the reference audio given on the command line:
+GET:
+`http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+POST:
+```json
+{
+    "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+    "text_language": "zh"
+}
+```
+
+Using the command-line reference audio and setting split punctuation:
+GET:
+`http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh&cut_punc=,。`
+POST:
+```json
+{
+    "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+    "text_language": "zh",
+    "cut_punc": ",。"
+}
+```
+
+Specifying the reference audio for a single request:
+GET:
+`http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+POST:
+```json
+{
+    "refer_wav_path": "123.wav",
+    "prompt_text": "一二三。",
+    "prompt_language": "zh",
+    "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+    "text_language": "zh"
+}
+```
+
+RESP:
+Success: returns the wav audio stream directly, http code 200
+Failure: returns json with the error message, http code 400
+
+
+### Change the default reference audio
+
+endpoint: `/change_refer`
+
+Keys are the same as for the inference endpoint
+
+GET:
+`http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
+POST:
+```json
+{
+    "refer_wav_path": "123.wav",
+    "prompt_text": "一二三。",
+    "prompt_language": "zh"
+}
+```
+
+RESP:
+Success: json, http code 200
+Failure: json, http code 400
+
+
+### Command control
+
+endpoint: `/control`
+
+command:
+"restart": restart the service
+"exit": stop the service
+
+GET:
+`http://127.0.0.1:9880/control?command=restart`
+POST:
+```json
+{
+    "command": "restart"
+}
+```
+
+RESP: none
+
+"""
+
+
+import argparse
+import os, re
+import sys
+import signal
+import LangSegment
+from time import time as ttime
+import torch
+import librosa
+import soundfile as sf
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse
+import uvicorn
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+import numpy as np
+from feature_extractor import cnhubert
+from io import BytesIO
+from module.models import SynthesizerTrn
+from AR.models.t2s_lightning_module import Text2SemanticLightningModule
+from text import cleaned_text_to_sequence
+from text.cleaner import clean_text
+from module.mel_processing import spectrogram_torch
+from my_utils import load_audio
+import config as global_config
+import logging
+import subprocess
+
+
+class DefaultRefer:
+    def __init__(self, path, text, language):
+        self.path = path
+        self.text = text
+        self.language = language
+
+    def is_ready(self) -> bool:
+        return is_full(self.path, self.text, self.language)
+
+
+def is_empty(*items):  # returns False if any item is non-empty
+    for item in items:
+        if item is not None and item != "":
+            return False
+    return True
+
+
+def is_full(*items):  # returns False if any item is empty
+    for item in items:
+        if item is None or item == "":
+            return False
+    return True
+
+
+def change_sovits_weights(sovits_path):
+    global vq_model, hps
+    dict_s2 = torch.load(sovits_path, map_location="cpu")
+    hps = dict_s2["config"]
+    hps = DictToAttrRecursive(hps)
+    hps.model.semantic_frame_rate = "25hz"
+    vq_model = SynthesizerTrn(
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        n_speakers=hps.data.n_speakers,
+        **hps.model
+    )
+    if ("pretrained" not in sovits_path):
+        del vq_model.enc_q
+    if is_half == True:
+        vq_model = vq_model.half().to(device)
+    else:
+        vq_model = vq_model.to(device)
+    vq_model.eval()
+    vq_model.load_state_dict(dict_s2["weight"], strict=False)
+
+
+def change_gpt_weights(gpt_path):
+    global hz, max_sec, t2s_model, config
+    hz = 50
+    dict_s1 = torch.load(gpt_path, map_location="cpu")
+    config = dict_s1["config"]
+    max_sec = config["data"]["max_sec"]
+    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
+    t2s_model.load_state_dict(dict_s1["weight"])
+    if is_half == True:
+        t2s_model = t2s_model.half()
+    t2s_model = t2s_model.to(device)
+    t2s_model.eval()
+    total = sum([param.nelement() for param in t2s_model.parameters()])
+    logger.info("Number of parameter: %.2fM" % (total / 1e6))
+
+
+def get_bert_feature(text, word2ph):
+    with torch.no_grad():
+        inputs = tokenizer(text, return_tensors="pt")
+        for i in inputs:
+            inputs[i] = inputs[i].to(device)  # inputs are long tensors, so precision follows bert_model
+        res = bert_model(**inputs, output_hidden_states=True)
+        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
+    assert len(word2ph) == len(text)
+    phone_level_feature = []
+    for i in range(len(word2ph)):
+        repeat_feature = res[i].repeat(word2ph[i], 1)
+        phone_level_feature.append(repeat_feature)
+    phone_level_feature = torch.cat(phone_level_feature, dim=0)
+    # if(is_half==True):phone_level_feature=phone_level_feature.half()
+    return phone_level_feature.T
+
+
+def clean_text_inf(text, language):
+    phones, word2ph, norm_text = clean_text(text, language)
+    phones = cleaned_text_to_sequence(phones)
+    return phones, word2ph, norm_text
+
+
+def get_bert_inf(phones, word2ph, norm_text, language):
+    language = language.replace("all_", "")
+    if language == "zh":
+        bert = get_bert_feature(norm_text, word2ph).to(device)  # .to(dtype)
+    else:
+        bert = torch.zeros(
+            (1024, len(phones)),
+            dtype=torch.float16 if is_half == True else torch.float32,
+        ).to(device)
+
+    return bert
+
+
+def get_phones_and_bert(text, language):
+    if language in {"en", "all_zh", "all_ja"}:
+        language = language.replace("all_", "")
+        if language == "en":
+            LangSegment.setfilters(["en"])
+            formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
+        else:
+            # Chinese and Japanese Han characters cannot be told apart, so trust the user's input
+            formattext = text
+        while "  " in formattext:
+            formattext = formattext.replace("  ", " ")
+        phones, word2ph, norm_text = clean_text_inf(formattext, language)
+        if language == "zh":
+            bert = get_bert_feature(norm_text, word2ph).to(device)
+        else:
+            bert = torch.zeros(
+                (1024, len(phones)),
+                dtype=torch.float16 if is_half == True else torch.float32,
+            ).to(device)
+    elif language in {"zh", "ja", "auto"}:
+        textlist = []
+        langlist = []
+        LangSegment.setfilters(["zh", "ja", "en", "ko"])
+        if language == "auto":
+            for tmp in LangSegment.getTexts(text):
+                if tmp["lang"] == "ko":
+                    langlist.append("zh")
+                    textlist.append(tmp["text"])
+                else:
+                    langlist.append(tmp["lang"])
+                    textlist.append(tmp["text"])
+        else:
+            for tmp in LangSegment.getTexts(text):
+                if tmp["lang"] == "en":
+                    langlist.append(tmp["lang"])
+                else:
+                    # Chinese and Japanese Han characters cannot be told apart, so trust the user's input
+                    langlist.append(language)
+                textlist.append(tmp["text"])
+        # logger.info(textlist)
+        # logger.info(langlist)
+        phones_list = []
+        bert_list = []
+        norm_text_list = []
+        for i in range(len(textlist)):
+            lang = langlist[i]
+            phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
+            bert = get_bert_inf(phones, word2ph, norm_text, lang)
+            phones_list.append(phones)
+            norm_text_list.append(norm_text)
+            bert_list.append(bert)
+        bert = torch.cat(bert_list, dim=1)
+        phones = sum(phones_list, [])
+        norm_text = ''.join(norm_text_list)
+
+    return phones, bert.to(torch.float16 if is_half == True else torch.float32), norm_text
+
+
+class DictToAttrRecursive:
+    def __init__(self, input_dict):
+        for key, value in input_dict.items():
+            if isinstance(value, dict):
+                # recurse into nested dicts
+                setattr(self, key, DictToAttrRecursive(value))
+            else:
+                setattr(self, key, value)
+
+
+def get_spepc(hps, filename):
+    audio = load_audio(filename, int(hps.data.sampling_rate))
+    audio = torch.FloatTensor(audio)
+    audio_norm = audio
+    audio_norm = audio_norm.unsqueeze(0)
+    spec = spectrogram_torch(audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length,
+                             hps.data.win_length, center=False)
+    return spec
+
+
+def pack_audio(audio_bytes, data, rate):
+    if media_type == "ogg":
+        audio_bytes = pack_ogg(audio_bytes, data, rate)
+    elif media_type == "aac":
+        audio_bytes = pack_aac(audio_bytes, data, rate)
+    else:
+        # wav cannot be streamed, so buffer raw PCM for now
+        audio_bytes = pack_raw(audio_bytes, data, rate)
+
+    return audio_bytes
+
+
+def pack_ogg(audio_bytes, data, rate):
+    with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
+        audio_file.write(data)
+
+    return audio_bytes
+
+
+def pack_raw(audio_bytes, data, rate):
+    audio_bytes.write(data.tobytes())
+
+    return audio_bytes
+
+
+def pack_wav(audio_bytes, rate):
+    data = np.frombuffer(audio_bytes.getvalue(), dtype=np.int16)
+    wav_bytes = BytesIO()
+    sf.write(wav_bytes, data, rate, format='wav')
+
+    return wav_bytes
+
+
+def pack_aac(audio_bytes, data, rate):
+    process = subprocess.Popen([
+        'ffmpeg',
+        '-f', 's16le',  # input: 16-bit signed little-endian PCM
+        '-ar', str(rate),  # sample rate
+        '-ac', '1',  # mono
+        '-i', 'pipe:0',  # read input from the pipe
+        '-c:a', 'aac',  # encode audio as AAC
+        '-b:a', '192k',  # bitrate
+        '-vn',  # no video
+        '-f', 'adts',  # output an ADTS AAC stream
+        'pipe:1'  # write output to the pipe
+    ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    out, _ = process.communicate(input=data.tobytes())
+    audio_bytes.write(out)
+
+    return audio_bytes
+
+
+def read_clean_buffer(audio_bytes):
+    audio_chunk = audio_bytes.getvalue()
+    audio_bytes.truncate(0)
+    audio_bytes.seek(0)
+
+    return audio_bytes, audio_chunk
+
+
+def cut_text(text, punc):
+    punc_list = [p for p in punc if p in {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ";", ":", "…"}]
+    if len(punc_list) > 0:
+        punds = r"[" + "".join(punc_list) + r"]"
+        text = text.strip("\n")
+        items = re.split(f"({punds})", text)
+        mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
+        # keep the text intact when there is no punctuation, or none at the end of the sentence
+        if len(items) % 2 == 1:
+            mergeitems.append(items[-1])
+        text = "\n".join(mergeitems)
+
+    while "\n\n" in text:
+        text = text.replace("\n\n", "\n")
+
+    return text
+
+
+def only_punc(text):
+    return not any(t.isalnum() or t.isalpha() for t in text)
+
+
+def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
+    t0 = ttime()
+    prompt_text = prompt_text.strip("\n")
+    prompt_language, text = prompt_language, text.strip("\n")
+    zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
+    with torch.no_grad():
+        wav16k, sr = librosa.load(ref_wav_path, sr=16000)
+        wav16k = torch.from_numpy(wav16k)
+        zero_wav_torch = torch.from_numpy(zero_wav)
+        if (is_half == True):
+            wav16k = wav16k.half().to(device)
+            zero_wav_torch = zero_wav_torch.half().to(device)
+        else:
+            wav16k = wav16k.to(device)
+            zero_wav_torch = zero_wav_torch.to(device)
+        wav16k = torch.cat([wav16k, zero_wav_torch])
+        ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)  # .float()
+        codes = vq_model.extract_latent(ssl_content)
+        prompt_semantic = codes[0, 0]
+    t1 = ttime()
+    prompt_language = dict_language[prompt_language.lower()]
+    text_language = dict_language[text_language.lower()]
+    phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language)
+    texts = text.split("\n")
+    audio_bytes = BytesIO()
+
+    for text in texts:
+        # crude guard: skip punctuation-only segments to avoid leaking the reference audio
+        if only_punc(text):
+            continue
+
+        audio_opt = []
+        phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language)
+        bert = torch.cat([bert1, bert2], 1)
+
+        all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
+        bert = bert.to(device).unsqueeze(0)
+        all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
+        prompt = prompt_semantic.unsqueeze(0).to(device)
+        t2 = ttime()
+        with torch.no_grad():
+            # pred_semantic = t2s_model.model.infer(
+            pred_semantic, idx = t2s_model.model.infer_panel(
+                all_phoneme_ids,
+                all_phoneme_len,
+                prompt,
+                bert,
+                # prompt_phone_len=ph_offset,
+                top_k=config['inference']['top_k'],
+                early_stop_num=hz * max_sec)
+        t3 = ttime()
+        # print(pred_semantic.shape,idx)
+        pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)  # mq needs one extra unsqueeze
+        refer = get_spepc(hps, ref_wav_path)  # .to(device)
+        if (is_half == True):
+            refer = refer.half().to(device)
+        else:
+            refer = refer.to(device)
+        # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
+        audio = \
+            vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
+                            refer).detach().cpu().numpy()[
+                0, 0]  # try reconstructing without the prompt part
+        audio_opt.append(audio)
+        audio_opt.append(zero_wav)
+        t4 = ttime()
+        audio_bytes = pack_audio(audio_bytes, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16), hps.data.sampling_rate)
+        # logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
+        if stream_mode == "normal":
+            audio_bytes, audio_chunk = read_clean_buffer(audio_bytes)
+            yield audio_chunk
+
+    if not stream_mode == "normal":
+        if media_type == "wav":
+            audio_bytes = pack_wav(audio_bytes, hps.data.sampling_rate)
+        yield audio_bytes.getvalue()
+
+
+
+def handle_control(command):
+    if command == "restart":
+        os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
+    elif command == "exit":
+        os.kill(os.getpid(), signal.SIGTERM)
+        exit(0)
+
+
+def handle_change(path, text, language):
+    if is_empty(path, text, language):
+        return JSONResponse({"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, status_code=400)
+
+    if path is not None and path != "":
+        default_refer.path = path
+    if text is not None and text != "":
+        default_refer.text = text
+    if language is not None and language != "":
+        default_refer.language = language
+
+    logger.info(f"当前默认参考音频路径: {default_refer.path}")
+    logger.info(f"当前默认参考音频文本: {default_refer.text}")
+    logger.info(f"当前默认参考音频语种: {default_refer.language}")
+    logger.info(f"is_ready: {default_refer.is_ready()}")
+
+    return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
+
+
+def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc):
+    if (
+            refer_wav_path == "" or refer_wav_path is None
+            or prompt_text == "" or prompt_text is None
+            or prompt_language == "" or prompt_language is None
+    ):
+        refer_wav_path, prompt_text, prompt_language = (
+            default_refer.path,
+            default_refer.text,
+            default_refer.language,
+        )
+        if not default_refer.is_ready():
+            return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
+
+    if cut_punc is None:
+        text = cut_text(text, default_cut_punc)
+    else:
+        text = cut_text(text, cut_punc)
+
+    return StreamingResponse(get_tts_wav(refer_wav_path, prompt_text, prompt_language, text, text_language), media_type="audio/" + media_type)
+
+
+
+
+# --------------------------------
+# Initialization
+# --------------------------------
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+sys.path.append("%s/GPT_SoVITS" % (now_dir))
+
+dict_language = {
+    "中文": "all_zh",
+    "英文": "en",
+    "日文": "all_ja",
+    "中英混合": "zh",
+    "日英混合": "ja",
+    "多语种混合": "auto",  # multilingual: segment the text and detect each language
+    "all_zh": "all_zh",
+    "en": "en",
+    "all_ja": "all_ja",
+    "zh": "zh",
+    "ja": "ja",
+    "auto": "auto",
+}
+
+# logger
+logging.config.dictConfig(uvicorn.config.LOGGING_CONFIG)
+logger = logging.getLogger('uvicorn')
+
+# load config
+g_config = global_config.Config()
+
+# parse arguments
+parser = argparse.ArgumentParser(description="GPT-SoVITS api")
+
+parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS模型路径")
+parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT模型路径")
+parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="默认参考音频路径")
+parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
+parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
+parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
+parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
+parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
+parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
+parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
+# boolean flags are passed as `python ./api.py -fp ...`,
+# which yields full_precision==True and half_precision==False
+parser.add_argument("-sm", "--stream_mode", type=str, default="close", help="流式返回模式, close / normal / keepalive")
+parser.add_argument("-mt", "--media_type", type=str, default="wav", help="音频编码格式, wav / ogg / aac")
+parser.add_argument("-cp", "--cut_punc", type=str, default="", help="文本切分符号设定, 符号范围,.;?!、,。?!;:…")
+# e.g. split on the common sentence punctuation with `python ./api.py -cp ".?!。?!"`
+parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="覆盖config.cnhubert_path")
+parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="覆盖config.bert_path")
+
+args = parser.parse_args()
+sovits_path = args.sovits_path
+gpt_path = args.gpt_path
+device = args.device
+port = args.port
+host = args.bind_addr
+cnhubert_base_path = args.hubert_path
+bert_path = args.bert_path
+default_cut_punc = args.cut_punc
+
+# apply the arguments
+default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
+
+# check the model paths
+if sovits_path == "":
+    sovits_path = g_config.pretrained_sovits_path
+    logger.warn(f"未指定SoVITS模型路径, fallback后当前值: {sovits_path}")
+if gpt_path == "":
+    gpt_path = g_config.pretrained_gpt_path
+    logger.warn(f"未指定GPT模型路径, fallback后当前值: {gpt_path}")
+
+# default reference audio, used when a caller provides none (or only some) of the reference parameters
+if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":
+    default_refer.path, default_refer.text, default_refer.language = "", "", ""
+    logger.info("未指定默认参考音频")
+else:
+    logger.info(f"默认参考音频路径: {default_refer.path}")
+    logger.info(f"默认参考音频文本: {default_refer.text}")
+    logger.info(f"默认参考音频语种: {default_refer.language}")
+
+# half precision
+is_half = g_config.is_half
+if args.full_precision:
+    is_half = False
+if args.half_precision:
+    is_half = True
+if args.full_precision and args.half_precision:
+    is_half = g_config.is_half  # conflicting flags: fall back to config
+logger.info(f"半精: {is_half}")
+
+# streaming mode
+if args.stream_mode.lower() in ["normal", "n"]:
+    stream_mode = "normal"
+    logger.info("流式返回已开启")
+else:
+    stream_mode = "close"
+
+# audio encoding
+if args.media_type.lower() in ["aac", "ogg"]:
+    media_type = args.media_type.lower()
+elif stream_mode == "close":
+    media_type = "wav"
+else:
+    media_type = "ogg"
+logger.info(f"编码格式: {media_type}")
+
+# initialize the models
+cnhubert.cnhubert_base_path = cnhubert_base_path
+tokenizer = AutoTokenizer.from_pretrained(bert_path)
+bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
+ssl_model = cnhubert.get_model()
+if is_half:
+    bert_model = bert_model.half().to(device)
+    ssl_model = ssl_model.half().to(device)
+else:
+    bert_model = bert_model.to(device)
+    ssl_model = ssl_model.to(device)
+change_sovits_weights(sovits_path)
+change_gpt_weights(gpt_path)
+
+
+
+
+# --------------------------------
+# Endpoints
+# --------------------------------
+app = FastAPI()
+
+@app.post("/set_model")
+async def set_model(request: Request):
+    json_post_raw = await request.json()
+    global gpt_path
+    gpt_path = json_post_raw.get("gpt_model_path")
+    global sovits_path
+    sovits_path = json_post_raw.get("sovits_model_path")
+    logger.info("gptpath" + gpt_path + ";vitspath" + sovits_path)
+    change_sovits_weights(sovits_path)
+    change_gpt_weights(gpt_path)
+    return "ok"
+
+
+@app.post("/control")
+async def control(request: Request):
+    json_post_raw = await request.json()
+    return handle_control(json_post_raw.get("command"))
+
+
+@app.get("/control")
+async def control(command: str = None):
+    return handle_control(command)
+
+
+@app.post("/change_refer")
+async def change_refer(request: Request):
+    json_post_raw = await request.json()
+    return handle_change(
+        json_post_raw.get("refer_wav_path"),
+        json_post_raw.get("prompt_text"),
+        json_post_raw.get("prompt_language")
+    )
+
+
+@app.get("/change_refer")
+async def change_refer(
+        refer_wav_path: str = None,
+        prompt_text: str = None,
+        prompt_language: str = None
+):
+    return handle_change(refer_wav_path, prompt_text, prompt_language)
+
+
+@app.post("/")
+async def tts_endpoint(request: Request):
+    json_post_raw = await request.json()
+    return handle(
+        json_post_raw.get("refer_wav_path"),
+        json_post_raw.get("prompt_text"),
+        json_post_raw.get("prompt_language"),
+        json_post_raw.get("text"),
+        json_post_raw.get("text_language"),
+        json_post_raw.get("cut_punc"),
+    )
+
+
+@app.get("/")
+async def tts_endpoint(
+        refer_wav_path: str = None,
+        prompt_text: str = None,
+        prompt_language: str = None,
+        text: str = None,
+        text_language: str = None,
+        cut_punc: str = None,
+):
+    return handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc)
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host=host, port=port, workers=1)
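For reference, a minimal Python client for the v1 `/` endpoint documented in the docstring above. This is a sketch, not part of the commit: it assumes the server is running locally on the default port 9880 with a default reference audio configured via `-dr`/`-dt`/`-dl`, and the output filename is illustrative.

```python
# Sketch: call the v1 inference endpoint and save the result.
# Assumes `python api.py -dr "123.wav" -dt "一二三。" -dl "zh"` is running locally.
import requests

resp = requests.post(
    "http://127.0.0.1:9880",
    json={
        "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
        "text_language": "zh",
        "cut_punc": ",。",  # optional per-request split punctuation
    },
)
if resp.status_code == 200:
    with open("output.wav", "wb") as f:  # non-streaming default media type is wav
        f.write(resp.content)
else:
    print(resp.json())  # {"code": 400, "message": ...} on failure
```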
api_v2.py
ADDED
@@ -0,0 +1,453 @@
+"""
+# WebAPI documentation
+
+` python api_v2.py -a 127.0.0.1 -p 9880 -c GPT_SoVITS/configs/tts_infer.yaml `
+
+## Command-line arguments:
+`-a` - `bind address, default "127.0.0.1"`
+`-p` - `bind port, default 9880`
+`-c` - `TTS config file path, default "GPT_SoVITS/configs/tts_infer.yaml"`
+
+## Endpoints:
+
+### Inference
+
+endpoint: `/tts`
+GET:
+```
+http://127.0.0.1:9880/tts?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_lang=zh&ref_audio_path=archive_jingyuan_1.wav&prompt_lang=zh&prompt_text=我是「罗浮」云骑将军景元。不必拘谨,「将军」只是一时的身份,你称呼我景元便可&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true
+```
+
+POST:
+```json
+{
+    "text": "",                   # str.(required) text to be synthesized
+    "text_lang": "",              # str.(required) language of the text to be synthesized
+    "ref_audio_path": "",         # str.(required) reference audio path.
+    "prompt_text": "",            # str.(optional) prompt text for the reference audio
+    "prompt_lang": "",            # str.(required) language of the prompt text for the reference audio
+    "top_k": 5,                   # int.(optional) top k sampling
+    "top_p": 1,                   # float.(optional) top p sampling
+    "temperature": 1,             # float.(optional) temperature for sampling
+    "text_split_method": "cut5",  # str.(optional) text split method, see text_segmentation_method.py for details.
+    "batch_size": 1,              # int.(optional) batch size for inference
+    "batch_threshold": 0.75,      # float.(optional) threshold for batch splitting.
+    "split_bucket": true,         # bool.(optional) whether to split the batch into multiple buckets.
+    "speed_factor": 1.0,          # float.(optional) control the speed of the synthesized audio.
+    "fragment_interval": 0.3,     # float.(optional) to control the interval of the audio fragment.
+    "seed": -1,                   # int.(optional) random seed for reproducibility.
+    "media_type": "wav",          # str.(optional) media type of the output audio, support "wav", "raw", "ogg", "aac".
+    "streaming_mode": false,      # bool.(optional) whether to return a streaming response.
+    "parallel_infer": true,       # bool.(optional) whether to use parallel inference.
+    "repetition_penalty": 1.35    # float.(optional) repetition penalty for T2S model.
+}
+```
+
+RESP:
+Success: returns the wav audio stream directly, http code 200
+Failure: returns json with the error message, http code 400
+
+### Command control
+
+endpoint: `/control`
+
+command:
+"restart": restart the service
+"exit": stop the service
+
+GET:
+```
+http://127.0.0.1:9880/control?command=restart
+```
+POST:
+```json
+{
+    "command": "restart"
+}
+```
+
+RESP: none
+
+
+### Switch GPT model
+
+endpoint: `/set_gpt_weights`
+
+GET:
+```
+http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+```
+RESP:
+Success: returns "success", http code 200
+Failure: returns json with the error message, http code 400
+
+
+### Switch SoVITS model
+
+endpoint: `/set_sovits_weights`
+
+GET:
+```
+http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/s2G488k.pth
+```
+
+RESP:
+Success: returns "success", http code 200
+Failure: returns json with the error message, http code 400
+
+"""
+import os
+import sys
+import traceback
+from typing import Generator
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+sys.path.append("%s/GPT_SoVITS" % (now_dir))
+
+import argparse
+import subprocess
+import wave
+import signal
+import numpy as np
+import soundfile as sf
+from fastapi import FastAPI, Request, HTTPException, Response
+from fastapi.responses import StreamingResponse, JSONResponse
+from fastapi import FastAPI, UploadFile, File
+import uvicorn
+from io import BytesIO
+from tools.i18n.i18n import I18nAuto
+from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
+from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+# print(sys.path)
+i18n = I18nAuto()
+cut_method_names = get_cut_method_names()
+
+parser = argparse.ArgumentParser(description="GPT-SoVITS api")
+parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径")
+parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
+parser.add_argument("-p", "--port", type=int, default=9880, help="default: 9880")
+args = parser.parse_args()
+config_path = args.tts_config
+# device = args.device
+port = args.port
+host = args.bind_addr
+argv = sys.argv
+
+if config_path in [None, ""]:
+    config_path = "GPT-SoVITS/configs/tts_infer.yaml"
+
+tts_config = TTS_Config(config_path)
+tts_pipeline = TTS(tts_config)
+
+APP = FastAPI()
+class TTS_Request(BaseModel):
+    text: str = None
+    text_lang: str = None
+    ref_audio_path: str = None
+    prompt_lang: str = None
+    prompt_text: str = ""
+    top_k: int = 5
+    top_p: float = 1
+    temperature: float = 1
+    text_split_method: str = "cut5"
+    batch_size: int = 1
+    batch_threshold: float = 0.75
+    split_bucket: bool = True
+    speed_factor: float = 1.0
+    fragment_interval: float = 0.3
+    seed: int = -1
+    media_type: str = "wav"
+    streaming_mode: bool = False
+    parallel_infer: bool = True
+    repetition_penalty: float = 1.35
+
+### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
+def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
+    with sf.SoundFile(io_buffer, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
+        audio_file.write(data)
+    return io_buffer
+
+
+def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int):
+    io_buffer.write(data.tobytes())
+    return io_buffer
+
+
+def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int):
+    io_buffer = BytesIO()
+    sf.write(io_buffer, data, rate, format='wav')
+    return io_buffer
+
+def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int):
+    process = subprocess.Popen([
+        'ffmpeg',
+        '-f', 's16le',  # input: 16-bit signed little-endian PCM
+        '-ar', str(rate),  # sample rate
+        '-ac', '1',  # mono
+        '-i', 'pipe:0',  # read input from the pipe
+        '-c:a', 'aac',  # encode audio as AAC
+        '-b:a', '192k',  # bitrate
+        '-vn',  # no video
+        '-f', 'adts',  # output an ADTS AAC stream
+        'pipe:1'  # write output to the pipe
+    ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    out, _ = process.communicate(input=data.tobytes())
+    io_buffer.write(out)
+    return io_buffer
+
+def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str):
+    if media_type == "ogg":
+        io_buffer = pack_ogg(io_buffer, data, rate)
+    elif media_type == "aac":
+        io_buffer = pack_aac(io_buffer, data, rate)
+    elif media_type == "wav":
+        io_buffer = pack_wav(io_buffer, data, rate)
+    else:
+        io_buffer = pack_raw(io_buffer, data, rate)
+    io_buffer.seek(0)
+    return io_buffer
+
+
+
+# from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py
+def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
+    # This will create a wave header then append the frame input
+    # It should come first on a streaming wav file
+    # Later frames should not have it (else you will hear artifacts at each chunk start)
+    wav_buf = BytesIO()
+    with wave.open(wav_buf, "wb") as vfout:
+        vfout.setnchannels(channels)
+        vfout.setsampwidth(sample_width)
+        vfout.setframerate(sample_rate)
+        vfout.writeframes(frame_input)
+
+    wav_buf.seek(0)
+    return wav_buf.read()
+
+
+def handle_control(command: str):
+    if command == "restart":
+        os.execl(sys.executable, sys.executable, *argv)
+    elif command == "exit":
+        os.kill(os.getpid(), signal.SIGTERM)
+        exit(0)
+
+
+def check_params(req: dict):
+    text: str = req.get("text", "")
+    text_lang: str = req.get("text_lang", "")
+    ref_audio_path: str = req.get("ref_audio_path", "")
+    streaming_mode: bool = req.get("streaming_mode", False)
+    media_type: str = req.get("media_type", "wav")
+    prompt_lang: str = req.get("prompt_lang", "")
+    text_split_method: str = req.get("text_split_method", "cut5")
+
+    if ref_audio_path in [None, ""]:
+        return JSONResponse(status_code=400, content={"message": "ref_audio_path is required"})
+    if text in [None, ""]:
+        return JSONResponse(status_code=400, content={"message": "text is required"})
+    if text_lang in [None, ""]:
+        return JSONResponse(status_code=400, content={"message": "text_lang is required"})
+    elif text_lang.lower() not in tts_config.languages:
+        return JSONResponse(status_code=400, content={"message": "text_lang is not supported"})
+    if prompt_lang in [None, ""]:
+        return JSONResponse(status_code=400, content={"message": "prompt_lang is required"})
+    elif prompt_lang.lower() not in tts_config.languages:
+        return JSONResponse(status_code=400, content={"message": "prompt_lang is not supported"})
+    if media_type not in ["wav", "raw", "ogg", "aac"]:
+        return JSONResponse(status_code=400, content={"message": "media_type is not supported"})
+    elif media_type == "ogg" and not streaming_mode:
+        return JSONResponse(status_code=400, content={"message": "ogg format is not supported in non-streaming mode"})
+
+    if text_split_method not in cut_method_names:
+        return JSONResponse(status_code=400, content={"message": f"text_split_method:{text_split_method} is not supported"})
+
+    return None
+
+async def tts_handle(req: dict):
+    """
+    Text to speech handler.
+
+    Args:
+        req (dict):
+            {
+                "text": "",                   # str.(required) text to be synthesized
+                "text_lang": "",              # str.(required) language of the text to be synthesized
+                "ref_audio_path": "",         # str.(required) reference audio path
+                "prompt_text": "",            # str.(optional) prompt text for the reference audio
+                "prompt_lang": "",            # str.(required) language of the prompt text for the reference audio
+                "top_k": 5,                   # int. top k sampling
+                "top_p": 1,                   # float. top p sampling
+                "temperature": 1,             # float. temperature for sampling
+                "text_split_method": "cut5",  # str. text split method, see text_segmentation_method.py for details.
+                "batch_size": 1,              # int. batch size for inference
+                "batch_threshold": 0.75,      # float. threshold for batch splitting.
+                "split_bucket": True,         # bool. whether to split the batch into multiple buckets.
+                "speed_factor": 1.0,          # float. control the speed of the synthesized audio.
+                "fragment_interval": 0.3,     # float. to control the interval of the audio fragment.
+                "seed": -1,                   # int. random seed for reproducibility.
+                "media_type": "wav",          # str. media type of the output audio, support "wav", "raw", "ogg", "aac".
+                "streaming_mode": False,      # bool. whether to return a streaming response.
+                "parallel_infer": True,       # bool.(optional) whether to use parallel inference.
+                "repetition_penalty": 1.35    # float.(optional) repetition penalty for T2S model.
+            }
+    returns:
+        StreamingResponse: audio stream response.
+    """
+
+    streaming_mode = req.get("streaming_mode", False)
+    media_type = req.get("media_type", "wav")
+
+    check_res = check_params(req)
+    if check_res is not None:
+        return check_res
+
+    if streaming_mode:
+        req["return_fragment"] = True
+
+    try:
+        tts_generator = tts_pipeline.run(req)
+
+        if streaming_mode:
+            def streaming_generator(tts_generator: Generator, media_type: str):
+                if media_type == "wav":
+                    yield wave_header_chunk()
+                    media_type = "raw"
+                for sr, chunk in tts_generator:
+                    yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue()
+            # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}"
+            return StreamingResponse(streaming_generator(tts_generator, media_type), media_type=f"audio/{media_type}")
+
+        else:
+            sr, audio_data = next(tts_generator)
+            audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue()
+            return Response(audio_data, media_type=f"audio/{media_type}")
+    except Exception as e:
+        return JSONResponse(status_code=400, content={"message": "tts failed", "Exception": str(e)})
+
+
+@APP.get("/control")
+async def control(command: str = None):
+    if command is None:
+        return JSONResponse(status_code=400, content={"message": "command is required"})
+    handle_control(command)
+
+
+@APP.get("/tts")
+async def tts_get_endpoint(
+        text: str = None,
+        text_lang: str = None,
+        ref_audio_path: str = None,
+        prompt_lang: str = None,
+        prompt_text: str = "",
+        top_k: int = 5,
+        top_p: float = 1,
+        temperature: float = 1,
+        text_split_method: str = "cut0",
+        batch_size: int = 1,
+        batch_threshold: float = 0.75,
+        split_bucket: bool = True,
+        speed_factor: float = 1.0,
+        fragment_interval: float = 0.3,
+        seed: int = -1,
+        media_type: str = "wav",
+        streaming_mode: bool = False,
+        parallel_infer: bool = True,
+        repetition_penalty: float = 1.35
+):
+    req = {
+        "text": text,
+        "text_lang": text_lang.lower(),
+        "ref_audio_path": ref_audio_path,
+        "prompt_text": prompt_text,
+        "prompt_lang": prompt_lang.lower(),
+        "top_k": top_k,
+        "top_p": top_p,
+        "temperature": temperature,
+        "text_split_method": text_split_method,
+        "batch_size": int(batch_size),
+        "batch_threshold": float(batch_threshold),
+        "speed_factor": float(speed_factor),
+        "split_bucket": split_bucket,
+        "fragment_interval": fragment_interval,
+        "seed": seed,
+        "media_type": media_type,
+        "streaming_mode": streaming_mode,
+        "parallel_infer": parallel_infer,
+        "repetition_penalty": float(repetition_penalty)
+    }
+    return await tts_handle(req)
+
+
+@APP.post("/tts")
+async def tts_post_endpoint(request: TTS_Request):
+    req = request.dict()
+    return await tts_handle(req)
+
+
+@APP.get("/set_refer_audio")
+async def set_refer_audio(refer_audio_path: str = None):
+    try:
+        tts_pipeline.set_ref_audio(refer_audio_path)
+    except Exception as e:
+        return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)})
+    return JSONResponse(status_code=200, content={"message": "success"})
+
+
+# @APP.post("/set_refer_audio")
+# async def set_refer_audio_post(audio_file: UploadFile = File(...)):
+#     try:
+#         # check the file type to make sure it is an audio file
+#         if not audio_file.content_type.startswith("audio/"):
+#             return JSONResponse(status_code=400, content={"message": "file type is not supported"})
+#
+#         os.makedirs("uploaded_audio", exist_ok=True)
+#         save_path = os.path.join("uploaded_audio", audio_file.filename)
+#         # save the uploaded audio to a directory on the server
+#         with open(save_path, "wb") as buffer:
+#             buffer.write(await audio_file.read())
+#
+#         tts_pipeline.set_ref_audio(save_path)
+#     except Exception as e:
+#         return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)})
+#     return JSONResponse(status_code=200, content={"message": "success"})
+
+@APP.get("/set_gpt_weights")
+async def set_gpt_weights(weights_path: str = None):
+    try:
+        if weights_path in ["", None]:
+            return JSONResponse(status_code=400, content={"message": "gpt weight path is required"})
+        tts_pipeline.init_t2s_weights(weights_path)
+    except Exception as e:
+        return JSONResponse(status_code=400, content={"message": "change gpt weight failed", "Exception": str(e)})
+
+    return JSONResponse(status_code=200, content={"message": "success"})
+
+
+@APP.get("/set_sovits_weights")
+async def set_sovits_weights(weights_path: str = None):
+    try:
+        if weights_path in ["", None]:
+            return JSONResponse(status_code=400, content={"message": "sovits weight path is required"})
+        tts_pipeline.init_vits_weights(weights_path)
+    except Exception as e:
+        return JSONResponse(status_code=400, content={"message": "change sovits weight failed", "Exception": str(e)})
+    return JSONResponse(status_code=200, content={"message": "success"})
+
+
+
+if __name__ == "__main__":
+    try:
+        uvicorn.run(APP, host=host, port=port, workers=1)
+    except Exception as e:
+        traceback.print_exc()
+        os.kill(os.getpid(), signal.SIGTERM)
+        exit(0)
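A streaming client sketch for the new `/tts` endpoint, mirroring the GET example in the docstring above; the host, port, and audio paths are assumptions taken from that example. With `streaming_mode` on and `media_type=wav`, the first chunk is the header produced by `wave_header_chunk` and later chunks are raw PCM:

```python
# Sketch: stream audio from the v2 /tts endpoint to a file.
# Host, port, and file paths are taken from the docstring example above.
import requests

params = {
    "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
    "text_lang": "zh",
    "ref_audio_path": "archive_jingyuan_1.wav",
    "prompt_lang": "zh",
    "prompt_text": "我是「罗浮」云骑将军景元。不必拘谨,「将军」只是一时的身份,你称呼我景元便可",
    "media_type": "wav",
    "streaming_mode": "true",
}
with requests.get("http://127.0.0.1:9880/tts", params=params, stream=True) as resp:
    resp.raise_for_status()
    with open("output.wav", "wb") as f:
        # the first chunk carries the wave header; subsequent chunks are raw PCM frames
        for chunk in resp.iter_content(chunk_size=4096):
            f.write(chunk)
```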
config.py
CHANGED
@@ -1,66 +1,66 @@
-import sys,os
-
-import torch
-
-# models specified for inference
-sovits_path = ""
-gpt_path = ""
-is_half_str = os.environ.get("is_half", "True")
-is_half = True if is_half_str.lower() == 'true' else False
-is_share_str = os.environ.get("is_share","False")
-is_share= True if is_share_str.lower() == 'true' else False
-
-cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
-bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
-pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
-pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
-
-exp_root = "logs"
-python_exec = sys.executable or "python"
-if torch.cuda.is_available():
-    infer_device = "cuda"
-else:
-    infer_device = "cpu"
-
-webui_port_main = 9874
-webui_port_uvr5 = 9873
-webui_port_infer_tts = 9872
-webui_port_subfix = 9871
-
-api_port = 9880
-
-if infer_device == "cuda":
-    gpu_name = torch.cuda.get_device_name(0)
-    if (
-            ("16" in gpu_name and "V100" not in gpu_name.upper())
-            or "P40" in gpu_name.upper()
-            or "P10" in gpu_name.upper()
-            or "1060" in gpu_name
-            or "1070" in gpu_name
-            or "1080" in gpu_name
-    ):
-        is_half=False
-
-if(infer_device=="cpu"):is_half=False
-
-class Config:
-    def __init__(self):
-        self.sovits_path = sovits_path
-        self.gpt_path = gpt_path
-        self.is_half = is_half
-
-        self.cnhubert_path = cnhubert_path
-        self.bert_path = bert_path
-        self.pretrained_sovits_path = pretrained_sovits_path
-        self.pretrained_gpt_path = pretrained_gpt_path
-
-        self.exp_root = exp_root
-        self.python_exec = python_exec
-        self.infer_device = infer_device
-
-        self.webui_port_main = webui_port_main
-        self.webui_port_uvr5 = webui_port_uvr5
-        self.webui_port_infer_tts = webui_port_infer_tts
-        self.webui_port_subfix = webui_port_subfix
-
-        self.api_port = api_port
+import sys,os
+
+import torch
+
+# models specified for inference
+sovits_path = ""
+gpt_path = ""
+is_half_str = os.environ.get("is_half", "True")
+is_half = True if is_half_str.lower() == 'true' else False
+is_share_str = os.environ.get("is_share","False")
+is_share= True if is_share_str.lower() == 'true' else False
+
+cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
+bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
+pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+
+exp_root = "logs"
+python_exec = sys.executable or "python"
+if torch.cuda.is_available():
+    infer_device = "cuda"
+else:
+    infer_device = "cpu"
+
+webui_port_main = 9874
+webui_port_uvr5 = 9873
+webui_port_infer_tts = 9872
+webui_port_subfix = 9871
+
+api_port = 9880
+
+if infer_device == "cuda":
+    gpu_name = torch.cuda.get_device_name(0)
+    if (
+            ("16" in gpu_name and "V100" not in gpu_name.upper())
+            or "P40" in gpu_name.upper()
+            or "P10" in gpu_name.upper()
+            or "1060" in gpu_name
+            or "1070" in gpu_name
+            or "1080" in gpu_name
+    ):
+        is_half=False
+
+if(infer_device=="cpu"):is_half=False
+
+class Config:
+    def __init__(self):
+        self.sovits_path = sovits_path
+        self.gpt_path = gpt_path
+        self.is_half = is_half
+
+        self.cnhubert_path = cnhubert_path
+        self.bert_path = bert_path
+        self.pretrained_sovits_path = pretrained_sovits_path
+        self.pretrained_gpt_path = pretrained_gpt_path
+
+        self.exp_root = exp_root
+        self.python_exec = python_exec
+        self.infer_device = infer_device
+
+        self.webui_port_main = webui_port_main
+        self.webui_port_uvr5 = webui_port_uvr5
+        self.webui_port_infer_tts = webui_port_infer_tts
+        self.webui_port_subfix = webui_port_subfix
+
+        self.api_port = api_port
go-webui.bat
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
-
runtime\python.exe webui.py
|
2 |
-
pause
|
|
|
1 |
+
runtime\python.exe webui.py
|
2 |
+
pause
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
numpy
|
2 |
scipy
|
3 |
tensorboard
|
4 |
librosa==0.9.2
|
@@ -25,14 +25,4 @@ jieba_fast
|
|
25 |
jieba
|
26 |
LangSegment>=0.2.0
|
27 |
Faster_Whisper
|
28 |
-
wordsegment
|
29 |
-
faster-whisper==0.9.0
|
30 |
-
pydub==0.25.1
|
31 |
-
wavmark==0.0.3
|
32 |
-
eng_to_ipa==0.0.2
|
33 |
-
inflect==7.0.0
|
34 |
-
unidecode==1.3.7
|
35 |
-
whisper-timestamped==1.14.2
|
36 |
-
openai
|
37 |
-
python-dotenv
|
38 |
-
langid==1.1.6
|
|
|
1 |
+
numpy
|
2 |
scipy
|
3 |
tensorboard
|
4 |
librosa==0.9.2
|
|
|
25 |
jieba
|
26 |
LangSegment>=0.2.0
|
27 |
Faster_Whisper
|
28 |
+
wordsegment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tools/asr/fasterwhisper_asr.py
CHANGED
@@ -1,18 +1,16 @@
|
|
1 |
import argparse
|
2 |
import os
|
3 |
-
os.environ["HF_ENDPOINT"]="https://hf-mirror.com"
|
4 |
import traceback
|
5 |
-
import requests
|
6 |
-
from glob import glob
|
7 |
-
import torch
|
8 |
|
|
|
|
|
|
|
|
|
9 |
from faster_whisper import WhisperModel
|
10 |
from tqdm import tqdm
|
11 |
|
12 |
from tools.asr.config import check_fw_local_models
|
13 |
|
14 |
-
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
|
15 |
-
|
16 |
language_code_list = [
|
17 |
"af", "am", "ar", "as", "az",
|
18 |
"ba", "be", "bg", "bn", "bo",
|
@@ -36,7 +34,7 @@ language_code_list = [
|
|
36 |
"vi", "yi", "yo", "zh", "yue",
|
37 |
"auto"]
|
38 |
|
39 |
-
def execute_asr(input_folder, output_folder, model_size, language,precision):
|
40 |
if '-local' in model_size:
|
41 |
model_size = model_size[:-6]
|
42 |
model_path = f'tools/asr/models/faster-whisper-{model_size}'
|
@@ -50,17 +48,18 @@ def execute_asr(input_folder, output_folder, model_size, language,precision):
|
|
50 |
model = WhisperModel(model_path, device=device, compute_type=precision)
|
51 |
except:
|
52 |
return print(traceback.format_exc())
|
|
|
|
|
|
|
|
|
53 |
output = []
|
54 |
output_file_name = os.path.basename(input_folder)
|
55 |
-
|
56 |
-
|
57 |
-
if not os.path.exists(output_folder):
|
58 |
-
os.makedirs(output_folder)
|
59 |
-
|
60 |
-
for file in tqdm(glob(os.path.join(input_folder, '**/*.wav'), recursive=True)):
|
61 |
try:
|
|
|
62 |
segments, info = model.transcribe(
|
63 |
-
audio =
|
64 |
beam_size = 5,
|
65 |
vad_filter = True,
|
66 |
vad_parameters = dict(min_silence_duration_ms=700),
|
@@ -68,18 +67,23 @@ def execute_asr(input_folder, output_folder, model_size, language,precision):
|
|
68 |
text = ''
|
69 |
|
70 |
if info.language == "zh":
|
71 |
-
print("
|
72 |
if("only_asr"not in globals()):
|
73 |
-
from tools.asr.funasr_asr import
|
74 |
-
|
|
|
75 |
|
76 |
if text == '':
|
77 |
for segment in segments:
|
78 |
text += segment.text
|
79 |
-
output.append(f"{
|
80 |
except:
|
81 |
return print(traceback.format_exc())
|
82 |
-
|
|
|
|
|
|
|
|
|
83 |
with open(output_file_path, "w", encoding="utf-8") as f:
|
84 |
f.write("\n".join(output))
|
85 |
print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
|
|
|
1 |
import argparse
|
2 |
import os
|
|
|
3 |
import traceback
|
|
|
|
|
|
|
4 |
|
5 |
+
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
|
6 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
7 |
+
|
8 |
+
import torch
|
9 |
from faster_whisper import WhisperModel
|
10 |
from tqdm import tqdm
|
11 |
|
12 |
from tools.asr.config import check_fw_local_models
|
13 |
|
|
|
|
|
14 |
language_code_list = [
|
15 |
"af", "am", "ar", "as", "az",
|
16 |
"ba", "be", "bg", "bn", "bo",
|
|
|
34 |
"vi", "yi", "yo", "zh", "yue",
|
35 |
"auto"]
|
36 |
|
37 |
+
def execute_asr(input_folder, output_folder, model_size, language, precision):
|
38 |
if '-local' in model_size:
|
39 |
model_size = model_size[:-6]
|
40 |
model_path = f'tools/asr/models/faster-whisper-{model_size}'
|
|
|
48 |
model = WhisperModel(model_path, device=device, compute_type=precision)
|
49 |
except:
|
50 |
return print(traceback.format_exc())
|
51 |
+
|
52 |
+
input_file_names = os.listdir(input_folder)
|
53 |
+
input_file_names.sort()
|
54 |
+
|
55 |
output = []
|
56 |
output_file_name = os.path.basename(input_folder)
|
57 |
+
|
58 |
+
for file_name in tqdm(input_file_names):
|
|
|
|
|
|
|
|
|
59 |
try:
|
60 |
+
file_path = os.path.join(input_folder, file_name)
|
61 |
segments, info = model.transcribe(
|
62 |
+
audio = file_path,
|
63 |
beam_size = 5,
|
64 |
vad_filter = True,
|
65 |
vad_parameters = dict(min_silence_duration_ms=700),
|
|
|
67 |
text = ''
|
68 |
|
69 |
if info.language == "zh":
|
70 |
+
print("检测为中文文本, 转 FunASR 处理")
|
71 |
if("only_asr"not in globals()):
|
72 |
+
from tools.asr.funasr_asr import \
|
73 |
+
only_asr # #如果用英文就不需要导入下载模型
|
74 |
+
text = only_asr(file_path)
|
75 |
|
76 |
if text == '':
|
77 |
for segment in segments:
|
78 |
text += segment.text
|
79 |
+
output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}")
|
80 |
except:
|
81 |
return print(traceback.format_exc())
|
82 |
+
|
83 |
+
output_folder = output_folder or "output/asr_opt"
|
84 |
+
os.makedirs(output_folder, exist_ok=True)
|
85 |
+
output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
|
86 |
+
|
87 |
with open(output_file_path, "w", encoding="utf-8") as f:
|
88 |
f.write("\n".join(output))
|
89 |
print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
|
tools/asr/funasr_asr.py
CHANGED
@@ -38,10 +38,11 @@ def execute_asr(input_folder, output_folder, model_size, language):
|
|
38 |
output = []
|
39 |
output_file_name = os.path.basename(input_folder)
|
40 |
|
41 |
-
for
|
42 |
try:
|
43 |
-
|
44 |
-
|
|
|
45 |
except:
|
46 |
print(traceback.format_exc())
|
47 |
|
|
|
38 |
output = []
|
39 |
output_file_name = os.path.basename(input_folder)
|
40 |
|
41 |
+
for file_name in tqdm(input_file_names):
|
42 |
try:
|
43 |
+
file_path = os.path.join(input_folder, file_name)
|
44 |
+
text = model.generate(input=file_path)[0]["text"]
|
45 |
+
output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
|
46 |
except:
|
47 |
print(traceback.format_exc())
|
48 |
|
tools/cmd-denoise.py
CHANGED
@@ -1,29 +1,29 @@
|
|
1 |
-
import os,argparse
|
2 |
-
|
3 |
-
from modelscope.pipelines import pipeline
|
4 |
-
from modelscope.utils.constant import Tasks
|
5 |
-
from tqdm import tqdm
|
6 |
-
|
7 |
-
path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
|
8 |
-
path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
|
9 |
-
ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise)
|
10 |
-
def execute_denoise(input_folder,output_folder):
|
11 |
-
os.makedirs(output_folder,exist_ok=True)
|
12 |
-
# print(input_folder)
|
13 |
-
# print(list(os.listdir(input_folder).sort()))
|
14 |
-
for name in tqdm(os.listdir(input_folder)):
|
15 |
-
ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name))
|
16 |
-
|
17 |
-
if __name__ == '__main__':
|
18 |
-
parser = argparse.ArgumentParser()
|
19 |
-
parser.add_argument("-i", "--input_folder", type=str, required=True,
|
20 |
-
help="Path to the folder containing WAV files.")
|
21 |
-
parser.add_argument("-o", "--output_folder", type=str, required=True,
|
22 |
-
help="Output folder to store transcriptions.")
|
23 |
-
parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
|
24 |
-
help="fp16 or fp32")#还没接入
|
25 |
-
cmd = parser.parse_args()
|
26 |
-
execute_denoise(
|
27 |
-
input_folder = cmd.input_folder,
|
28 |
-
output_folder = cmd.output_folder,
|
29 |
)
|
|
|
1 |
+
import os,argparse
|
2 |
+
|
3 |
+
from modelscope.pipelines import pipeline
|
4 |
+
from modelscope.utils.constant import Tasks
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
|
8 |
+
path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
|
9 |
+
ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise)
|
10 |
+
def execute_denoise(input_folder,output_folder):
|
11 |
+
os.makedirs(output_folder,exist_ok=True)
|
12 |
+
# print(input_folder)
|
13 |
+
# print(list(os.listdir(input_folder).sort()))
|
14 |
+
for name in tqdm(os.listdir(input_folder)):
|
15 |
+
ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name))
|
16 |
+
|
17 |
+
if __name__ == '__main__':
|
18 |
+
parser = argparse.ArgumentParser()
|
19 |
+
parser.add_argument("-i", "--input_folder", type=str, required=True,
|
20 |
+
help="Path to the folder containing WAV files.")
|
21 |
+
parser.add_argument("-o", "--output_folder", type=str, required=True,
|
22 |
+
help="Output folder to store transcriptions.")
|
23 |
+
parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
|
24 |
+
help="fp16 or fp32")#还没接入
|
25 |
+
cmd = parser.parse_args()
|
26 |
+
execute_denoise(
|
27 |
+
input_folder = cmd.input_folder,
|
28 |
+
output_folder = cmd.output_folder,
|
29 |
)
|
tools/i18n/i18n.py
CHANGED
@@ -4,7 +4,7 @@ import os
|
|
4 |
|
5 |
|
6 |
def load_language_list(language):
|
7 |
-
with open(f"./i18n/locale/
|
8 |
language_list = json.load(f)
|
9 |
return language_list
|
10 |
|
|
|
4 |
|
5 |
|
6 |
def load_language_list(language):
|
7 |
+
with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f:
|
8 |
language_list = json.load(f)
|
9 |
return language_list
|
10 |
|
tools/my_utils.py
CHANGED
@@ -1,31 +1,31 @@
|
|
1 |
-
import platform,os,traceback
|
2 |
-
import ffmpeg
|
3 |
-
import numpy as np
|
4 |
-
|
5 |
-
|
6 |
-
def load_audio(file, sr):
|
7 |
-
try:
|
8 |
-
# https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
|
9 |
-
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
|
10 |
-
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
|
11 |
-
file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车
|
12 |
-
if os.path.exists(file) == False:
|
13 |
-
raise RuntimeError(
|
14 |
-
"You input a wrong audio path that does not exists, please fix it!"
|
15 |
-
)
|
16 |
-
out, _ = (
|
17 |
-
ffmpeg.input(file, threads=0)
|
18 |
-
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
|
19 |
-
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
|
20 |
-
)
|
21 |
-
except Exception as e:
|
22 |
-
traceback.print_exc()
|
23 |
-
raise RuntimeError(f"Failed to load audio: {e}")
|
24 |
-
|
25 |
-
return np.frombuffer(out, np.float32).flatten()
|
26 |
-
|
27 |
-
|
28 |
-
def clean_path(path_str):
|
29 |
-
if platform.system() == 'Windows':
|
30 |
-
path_str = path_str.replace('/', '\\')
|
31 |
-
return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
|
|
|
1 |
+
import platform,os,traceback
|
2 |
+
import ffmpeg
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
|
6 |
+
def load_audio(file, sr):
|
7 |
+
try:
|
8 |
+
# https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
|
9 |
+
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
|
10 |
+
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
|
11 |
+
file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车
|
12 |
+
if os.path.exists(file) == False:
|
13 |
+
raise RuntimeError(
|
14 |
+
"You input a wrong audio path that does not exists, please fix it!"
|
15 |
+
)
|
16 |
+
out, _ = (
|
17 |
+
ffmpeg.input(file, threads=0)
|
18 |
+
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
|
19 |
+
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
|
20 |
+
)
|
21 |
+
except Exception as e:
|
22 |
+
traceback.print_exc()
|
23 |
+
raise RuntimeError(f"Failed to load audio: {e}")
|
24 |
+
|
25 |
+
return np.frombuffer(out, np.float32).flatten()
|
26 |
+
|
27 |
+
|
28 |
+
def clean_path(path_str):
|
29 |
+
if platform.system() == 'Windows':
|
30 |
+
path_str = path_str.replace('/', '\\')
|
31 |
+
return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
|
tools/slice_audio.py
CHANGED
@@ -1,48 +1,48 @@
|
|
1 |
-
import os,sys,numpy as np
|
2 |
-
import traceback
|
3 |
-
from scipy.io import wavfile
|
4 |
-
# parent_directory = os.path.dirname(os.path.abspath(__file__))
|
5 |
-
# sys.path.append(parent_directory)
|
6 |
-
from my_utils import load_audio
|
7 |
-
from slicer2 import Slicer
|
8 |
-
|
9 |
-
def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part):
|
10 |
-
os.makedirs(opt_root,exist_ok=True)
|
11 |
-
if os.path.isfile(inp):
|
12 |
-
input=[inp]
|
13 |
-
elif os.path.isdir(inp):
|
14 |
-
input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))]
|
15 |
-
else:
|
16 |
-
return "输入路径存在但既不是文件也不是文件夹"
|
17 |
-
slicer = Slicer(
|
18 |
-
sr=32000, # 长音频采样率
|
19 |
-
threshold= int(threshold), # 音量小于这个值视作静音的备选切割点
|
20 |
-
min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值
|
21 |
-
min_interval= int(min_interval), # 最短切割间隔
|
22 |
-
hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)
|
23 |
-
max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长
|
24 |
-
)
|
25 |
-
_max=float(_max)
|
26 |
-
alpha=float(alpha)
|
27 |
-
for inp_path in input[int(i_part)::int(all_part)]:
|
28 |
-
# print(inp_path)
|
29 |
-
try:
|
30 |
-
name = os.path.basename(inp_path)
|
31 |
-
audio = load_audio(inp_path, 32000)
|
32 |
-
# print(audio.shape)
|
33 |
-
for chunk, start, end in slicer.slice(audio): # start和end是帧数
|
34 |
-
tmp_max = np.abs(chunk).max()
|
35 |
-
if(tmp_max>1):chunk/=tmp_max
|
36 |
-
chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
|
37 |
-
wavfile.write(
|
38 |
-
"%s/%s_%010d_%010d.wav" % (opt_root, name, start, end),
|
39 |
-
32000,
|
40 |
-
# chunk.astype(np.float32),
|
41 |
-
(chunk * 32767).astype(np.int16),
|
42 |
-
)
|
43 |
-
except:
|
44 |
-
print(inp_path,"->fail->",traceback.format_exc())
|
45 |
-
return "执行完毕,请检查输出文件"
|
46 |
-
|
47 |
-
print(slice(*sys.argv[1:]))
|
48 |
-
|
|
|
1 |
+
import os,sys,numpy as np
|
2 |
+
import traceback
|
3 |
+
from scipy.io import wavfile
|
4 |
+
# parent_directory = os.path.dirname(os.path.abspath(__file__))
|
5 |
+
# sys.path.append(parent_directory)
|
6 |
+
from my_utils import load_audio
|
7 |
+
from slicer2 import Slicer
|
8 |
+
|
9 |
+
def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part):
|
10 |
+
os.makedirs(opt_root,exist_ok=True)
|
11 |
+
if os.path.isfile(inp):
|
12 |
+
input=[inp]
|
13 |
+
elif os.path.isdir(inp):
|
14 |
+
input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))]
|
15 |
+
else:
|
16 |
+
return "输入路径存在但既不是文件也不是文件夹"
|
17 |
+
slicer = Slicer(
|
18 |
+
sr=32000, # 长音频采样率
|
19 |
+
threshold= int(threshold), # 音量小于这个值视作静音的备选切割点
|
20 |
+
min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值
|
21 |
+
min_interval= int(min_interval), # 最短切割间隔
|
22 |
+
hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)
|
23 |
+
max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长
|
24 |
+
)
|
25 |
+
_max=float(_max)
|
26 |
+
alpha=float(alpha)
|
27 |
+
for inp_path in input[int(i_part)::int(all_part)]:
|
28 |
+
# print(inp_path)
|
29 |
+
try:
|
30 |
+
name = os.path.basename(inp_path)
|
31 |
+
audio = load_audio(inp_path, 32000)
|
32 |
+
# print(audio.shape)
|
33 |
+
for chunk, start, end in slicer.slice(audio): # start和end是帧数
|
34 |
+
tmp_max = np.abs(chunk).max()
|
35 |
+
if(tmp_max>1):chunk/=tmp_max
|
36 |
+
chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
|
37 |
+
wavfile.write(
|
38 |
+
"%s/%s_%010d_%010d.wav" % (opt_root, name, start, end),
|
39 |
+
32000,
|
40 |
+
# chunk.astype(np.float32),
|
41 |
+
(chunk * 32767).astype(np.int16),
|
42 |
+
)
|
43 |
+
except:
|
44 |
+
print(inp_path,"->fail->",traceback.format_exc())
|
45 |
+
return "执行完毕,请检查输出文件"
|
46 |
+
|
47 |
+
print(slice(*sys.argv[1:]))
|
48 |
+
|
tools/slicer2.py
CHANGED
@@ -1,261 +1,261 @@
|
|
1 |
-
import numpy as np
|
2 |
-
|
3 |
-
|
4 |
-
# This function is obtained from librosa.
|
5 |
-
def get_rms(
|
6 |
-
y,
|
7 |
-
frame_length=2048,
|
8 |
-
hop_length=512,
|
9 |
-
pad_mode="constant",
|
10 |
-
):
|
11 |
-
padding = (int(frame_length // 2), int(frame_length // 2))
|
12 |
-
y = np.pad(y, padding, mode=pad_mode)
|
13 |
-
|
14 |
-
axis = -1
|
15 |
-
# put our new within-frame axis at the end for now
|
16 |
-
out_strides = y.strides + tuple([y.strides[axis]])
|
17 |
-
# Reduce the shape on the framing axis
|
18 |
-
x_shape_trimmed = list(y.shape)
|
19 |
-
x_shape_trimmed[axis] -= frame_length - 1
|
20 |
-
out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
|
21 |
-
xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
|
22 |
-
if axis < 0:
|
23 |
-
target_axis = axis - 1
|
24 |
-
else:
|
25 |
-
target_axis = axis + 1
|
26 |
-
xw = np.moveaxis(xw, -1, target_axis)
|
27 |
-
# Downsample along the target axis
|
28 |
-
slices = [slice(None)] * xw.ndim
|
29 |
-
slices[axis] = slice(0, None, hop_length)
|
30 |
-
x = xw[tuple(slices)]
|
31 |
-
|
32 |
-
# Calculate power
|
33 |
-
power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
|
34 |
-
|
35 |
-
return np.sqrt(power)
|
36 |
-
|
37 |
-
|
38 |
-
class Slicer:
|
39 |
-
def __init__(
|
40 |
-
self,
|
41 |
-
sr: int,
|
42 |
-
threshold: float = -40.0,
|
43 |
-
min_length: int = 5000,
|
44 |
-
min_interval: int = 300,
|
45 |
-
hop_size: int = 20,
|
46 |
-
max_sil_kept: int = 5000,
|
47 |
-
):
|
48 |
-
if not min_length >= min_interval >= hop_size:
|
49 |
-
raise ValueError(
|
50 |
-
"The following condition must be satisfied: min_length >= min_interval >= hop_size"
|
51 |
-
)
|
52 |
-
if not max_sil_kept >= hop_size:
|
53 |
-
raise ValueError(
|
54 |
-
"The following condition must be satisfied: max_sil_kept >= hop_size"
|
55 |
-
)
|
56 |
-
min_interval = sr * min_interval / 1000
|
57 |
-
self.threshold = 10 ** (threshold / 20.0)
|
58 |
-
self.hop_size = round(sr * hop_size / 1000)
|
59 |
-
self.win_size = min(round(min_interval), 4 * self.hop_size)
|
60 |
-
self.min_length = round(sr * min_length / 1000 / self.hop_size)
|
61 |
-
self.min_interval = round(min_interval / self.hop_size)
|
62 |
-
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
|
63 |
-
|
64 |
-
def _apply_slice(self, waveform, begin, end):
|
65 |
-
if len(waveform.shape) > 1:
|
66 |
-
return waveform[
|
67 |
-
:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
|
68 |
-
]
|
69 |
-
else:
|
70 |
-
return waveform[
|
71 |
-
begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
|
72 |
-
]
|
73 |
-
|
74 |
-
# @timeit
|
75 |
-
def slice(self, waveform):
|
76 |
-
if len(waveform.shape) > 1:
|
77 |
-
samples = waveform.mean(axis=0)
|
78 |
-
else:
|
79 |
-
samples = waveform
|
80 |
-
if samples.shape[0] <= self.min_length:
|
81 |
-
return [waveform]
|
82 |
-
rms_list = get_rms(
|
83 |
-
y=samples, frame_length=self.win_size, hop_length=self.hop_size
|
84 |
-
).squeeze(0)
|
85 |
-
sil_tags = []
|
86 |
-
silence_start = None
|
87 |
-
clip_start = 0
|
88 |
-
for i, rms in enumerate(rms_list):
|
89 |
-
# Keep looping while frame is silent.
|
90 |
-
if rms < self.threshold:
|
91 |
-
# Record start of silent frames.
|
92 |
-
if silence_start is None:
|
93 |
-
silence_start = i
|
94 |
-
continue
|
95 |
-
# Keep looping while frame is not silent and silence start has not been recorded.
|
96 |
-
if silence_start is None:
|
97 |
-
continue
|
98 |
-
# Clear recorded silence start if interval is not enough or clip is too short
|
99 |
-
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
|
100 |
-
need_slice_middle = (
|
101 |
-
i - silence_start >= self.min_interval
|
102 |
-
and i - clip_start >= self.min_length
|
103 |
-
)
|
104 |
-
if not is_leading_silence and not need_slice_middle:
|
105 |
-
silence_start = None
|
106 |
-
continue
|
107 |
-
# Need slicing. Record the range of silent frames to be removed.
|
108 |
-
if i - silence_start <= self.max_sil_kept:
|
109 |
-
pos = rms_list[silence_start : i + 1].argmin() + silence_start
|
110 |
-
if silence_start == 0:
|
111 |
-
sil_tags.append((0, pos))
|
112 |
-
else:
|
113 |
-
sil_tags.append((pos, pos))
|
114 |
-
clip_start = pos
|
115 |
-
elif i - silence_start <= self.max_sil_kept * 2:
|
116 |
-
pos = rms_list[
|
117 |
-
i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
|
118 |
-
].argmin()
|
119 |
-
pos += i - self.max_sil_kept
|
120 |
-
pos_l = (
|
121 |
-
rms_list[
|
122 |
-
silence_start : silence_start + self.max_sil_kept + 1
|
123 |
-
].argmin()
|
124 |
-
+ silence_start
|
125 |
-
)
|
126 |
-
pos_r = (
|
127 |
-
rms_list[i - self.max_sil_kept : i + 1].argmin()
|
128 |
-
+ i
|
129 |
-
- self.max_sil_kept
|
130 |
-
)
|
131 |
-
if silence_start == 0:
|
132 |
-
sil_tags.append((0, pos_r))
|
133 |
-
clip_start = pos_r
|
134 |
-
else:
|
135 |
-
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
|
136 |
-
clip_start = max(pos_r, pos)
|
137 |
-
else:
|
138 |
-
pos_l = (
|
139 |
-
rms_list[
|
140 |
-
silence_start : silence_start + self.max_sil_kept + 1
|
141 |
-
].argmin()
|
142 |
-
+ silence_start
|
143 |
-
)
|
144 |
-
pos_r = (
|
145 |
-
rms_list[i - self.max_sil_kept : i + 1].argmin()
|
146 |
-
+ i
|
147 |
-
- self.max_sil_kept
|
148 |
-
)
|
149 |
-
if silence_start == 0:
|
150 |
-
sil_tags.append((0, pos_r))
|
151 |
-
else:
|
152 |
-
sil_tags.append((pos_l, pos_r))
|
153 |
-
clip_start = pos_r
|
154 |
-
silence_start = None
|
155 |
-
# Deal with trailing silence.
|
156 |
-
total_frames = rms_list.shape[0]
|
157 |
-
if (
|
158 |
-
silence_start is not None
|
159 |
-
and total_frames - silence_start >= self.min_interval
|
160 |
-
):
|
161 |
-
silence_end = min(total_frames, silence_start + self.max_sil_kept)
|
162 |
-
pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
|
163 |
-
sil_tags.append((pos, total_frames + 1))
|
164 |
-
# Apply and return slices.
|
165 |
-
####音频+起始时间+终止时间
|
166 |
-
if len(sil_tags) == 0:
|
167 |
-
return [[waveform,0,int(total_frames*self.hop_size)]]
|
168 |
-
else:
|
169 |
-
chunks = []
|
170 |
-
if sil_tags[0][0] > 0:
|
171 |
-
chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]),0,int(sil_tags[0][0]*self.hop_size)])
|
172 |
-
for i in range(len(sil_tags) - 1):
|
173 |
-
chunks.append(
|
174 |
-
[self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),int(sil_tags[i][1]*self.hop_size),int(sil_tags[i + 1][0]*self.hop_size)]
|
175 |
-
)
|
176 |
-
if sil_tags[-1][1] < total_frames:
|
177 |
-
chunks.append(
|
178 |
-
[self._apply_slice(waveform, sil_tags[-1][1], total_frames),int(sil_tags[-1][1]*self.hop_size),int(total_frames*self.hop_size)]
|
179 |
-
)
|
180 |
-
return chunks
|
181 |
-
|
182 |
-
|
183 |
-
def main():
|
184 |
-
import os.path
|
185 |
-
from argparse import ArgumentParser
|
186 |
-
|
187 |
-
import librosa
|
188 |
-
import soundfile
|
189 |
-
|
190 |
-
parser = ArgumentParser()
|
191 |
-
parser.add_argument("audio", type=str, help="The audio to be sliced")
|
192 |
-
parser.add_argument(
|
193 |
-
"--out", type=str, help="Output directory of the sliced audio clips"
|
194 |
-
)
|
195 |
-
parser.add_argument(
|
196 |
-
"--db_thresh",
|
197 |
-
type=float,
|
198 |
-
required=False,
|
199 |
-
default=-40,
|
200 |
-
help="The dB threshold for silence detection",
|
201 |
-
)
|
202 |
-
parser.add_argument(
|
203 |
-
"--min_length",
|
204 |
-
type=int,
|
205 |
-
required=False,
|
206 |
-
default=5000,
|
207 |
-
help="The minimum milliseconds required for each sliced audio clip",
|
208 |
-
)
|
209 |
-
parser.add_argument(
|
210 |
-
"--min_interval",
|
211 |
-
type=int,
|
212 |
-
required=False,
|
213 |
-
default=300,
|
214 |
-
help="The minimum milliseconds for a silence part to be sliced",
|
215 |
-
)
|
216 |
-
parser.add_argument(
|
217 |
-
"--hop_size",
|
218 |
-
type=int,
|
219 |
-
required=False,
|
220 |
-
default=10,
|
221 |
-
help="Frame length in milliseconds",
|
222 |
-
)
|
223 |
-
parser.add_argument(
|
224 |
-
"--max_sil_kept",
|
225 |
-
type=int,
|
226 |
-
required=False,
|
227 |
-
default=500,
|
228 |
-
help="The maximum silence length kept around the sliced clip, presented in milliseconds",
|
229 |
-
)
|
230 |
-
args = parser.parse_args()
|
231 |
-
out = args.out
|
232 |
-
if out is None:
|
233 |
-
out = os.path.dirname(os.path.abspath(args.audio))
|
234 |
-
audio, sr = librosa.load(args.audio, sr=None, mono=False)
|
235 |
-
slicer = Slicer(
|
236 |
-
sr=sr,
|
237 |
-
threshold=args.db_thresh,
|
238 |
-
min_length=args.min_length,
|
239 |
-
min_interval=args.min_interval,
|
240 |
-
hop_size=args.hop_size,
|
241 |
-
max_sil_kept=args.max_sil_kept,
|
242 |
-
)
|
243 |
-
chunks = slicer.slice(audio)
|
244 |
-
if not os.path.exists(out):
|
245 |
-
os.makedirs(out)
|
246 |
-
for i, chunk in enumerate(chunks):
|
247 |
-
if len(chunk.shape) > 1:
|
248 |
-
chunk = chunk.T
|
249 |
-
soundfile.write(
|
250 |
-
os.path.join(
|
251 |
-
out,
|
252 |
-
f"%s_%d.wav"
|
253 |
-
% (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
|
254 |
-
),
|
255 |
-
chunk,
|
256 |
-
sr,
|
257 |
-
)
|
258 |
-
|
259 |
-
|
260 |
-
if __name__ == "__main__":
|
261 |
-
main()
|
|
|
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
|
4 |
+
# This function is obtained from librosa.
|
5 |
+
def get_rms(
|
6 |
+
y,
|
7 |
+
frame_length=2048,
|
8 |
+
hop_length=512,
|
9 |
+
pad_mode="constant",
|
10 |
+
):
|
11 |
+
padding = (int(frame_length // 2), int(frame_length // 2))
|
12 |
+
y = np.pad(y, padding, mode=pad_mode)
|
13 |
+
|
14 |
+
axis = -1
|
15 |
+
# put our new within-frame axis at the end for now
|
16 |
+
out_strides = y.strides + tuple([y.strides[axis]])
|
17 |
+
# Reduce the shape on the framing axis
|
18 |
+
x_shape_trimmed = list(y.shape)
|
19 |
+
x_shape_trimmed[axis] -= frame_length - 1
|
20 |
+
out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
|
21 |
+
xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
|
22 |
+
if axis < 0:
|
23 |
+
target_axis = axis - 1
|
24 |
+
else:
|
25 |
+
target_axis = axis + 1
|
26 |
+
xw = np.moveaxis(xw, -1, target_axis)
|
27 |
+
# Downsample along the target axis
|
28 |
+
slices = [slice(None)] * xw.ndim
|
29 |
+
slices[axis] = slice(0, None, hop_length)
|
30 |
+
x = xw[tuple(slices)]
|
31 |
+
|
32 |
+
# Calculate power
|
33 |
+
power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
|
34 |
+
|
35 |
+
return np.sqrt(power)
|
36 |
+
|
37 |
+
|
38 |
+
class Slicer:
|
39 |
+
def __init__(
|
40 |
+
self,
|
41 |
+
sr: int,
|
42 |
+
threshold: float = -40.0,
|
43 |
+
min_length: int = 5000,
|
44 |
+
min_interval: int = 300,
|
45 |
+
hop_size: int = 20,
|
46 |
+
max_sil_kept: int = 5000,
|
47 |
+
):
|
48 |
+
if not min_length >= min_interval >= hop_size:
|
49 |
+
raise ValueError(
|
50 |
+
"The following condition must be satisfied: min_length >= min_interval >= hop_size"
|
51 |
+
)
|
52 |
+
if not max_sil_kept >= hop_size:
|
53 |
+
raise ValueError(
|
54 |
+
"The following condition must be satisfied: max_sil_kept >= hop_size"
|
55 |
+
)
|
56 |
+
min_interval = sr * min_interval / 1000
|
57 |
+
self.threshold = 10 ** (threshold / 20.0)
|
58 |
+
self.hop_size = round(sr * hop_size / 1000)
|
59 |
+
self.win_size = min(round(min_interval), 4 * self.hop_size)
|
60 |
+
self.min_length = round(sr * min_length / 1000 / self.hop_size)
|
61 |
+
self.min_interval = round(min_interval / self.hop_size)
|
62 |
+
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
|
63 |
+
|
64 |
+
def _apply_slice(self, waveform, begin, end):
|
65 |
+
if len(waveform.shape) > 1:
|
66 |
+
return waveform[
|
67 |
+
:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
|
68 |
+
]
|
69 |
+
else:
|
70 |
+
return waveform[
|
71 |
+
begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
|
72 |
+
]
|
73 |
+
|
74 |
+
# @timeit
|
75 |
+
def slice(self, waveform):
|
76 |
+
if len(waveform.shape) > 1:
|
77 |
+
samples = waveform.mean(axis=0)
|
78 |
+
else:
|
79 |
+
samples = waveform
|
80 |
+
if samples.shape[0] <= self.min_length:
|
81 |
+
return [waveform]
|
82 |
+
rms_list = get_rms(
|
83 |
+
y=samples, frame_length=self.win_size, hop_length=self.hop_size
|
84 |
+
).squeeze(0)
|
85 |
+
sil_tags = []
|
86 |
+
silence_start = None
|
87 |
+
clip_start = 0
|
88 |
+
for i, rms in enumerate(rms_list):
|
89 |
+
# Keep looping while frame is silent.
|
90 |
+
if rms < self.threshold:
|
91 |
+
# Record start of silent frames.
|
92 |
+
if silence_start is None:
|
93 |
+
silence_start = i
|
94 |
+
continue
|
95 |
+
# Keep looping while frame is not silent and silence start has not been recorded.
|
96 |
+
if silence_start is None:
|
97 |
+
continue
|
98 |
+
# Clear recorded silence start if interval is not enough or clip is too short
|
99 |
+
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
|
100 |
+
need_slice_middle = (
|
101 |
+
i - silence_start >= self.min_interval
|
102 |
+
and i - clip_start >= self.min_length
|
103 |
+
)
|
104 |
+
if not is_leading_silence and not need_slice_middle:
|
105 |
+
silence_start = None
|
106 |
+
continue
|
107 |
+
# Need slicing. Record the range of silent frames to be removed.
|
108 |
+
if i - silence_start <= self.max_sil_kept:
|
109 |
+
pos = rms_list[silence_start : i + 1].argmin() + silence_start
|
110 |
+
if silence_start == 0:
|
111 |
+
sil_tags.append((0, pos))
|
112 |
+
else:
|
113 |
+
sil_tags.append((pos, pos))
|
114 |
+
clip_start = pos
|
115 |
+
elif i - silence_start <= self.max_sil_kept * 2:
|
116 |
+
pos = rms_list[
|
117 |
+
i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
|
118 |
+
].argmin()
|
119 |
+
pos += i - self.max_sil_kept
|
120 |
+
pos_l = (
|
121 |
+
rms_list[
|
122 |
+
silence_start : silence_start + self.max_sil_kept + 1
|
123 |
+
].argmin()
|
124 |
+
+ silence_start
|
125 |
+
)
|
126 |
+
pos_r = (
|
127 |
+
rms_list[i - self.max_sil_kept : i + 1].argmin()
|
128 |
+
+ i
|
129 |
+
- self.max_sil_kept
|
130 |
+
)
|
131 |
+
if silence_start == 0:
|
132 |
+
sil_tags.append((0, pos_r))
|
133 |
+
clip_start = pos_r
|
134 |
+
else:
|
135 |
+
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
|
136 |
+
clip_start = max(pos_r, pos)
|
137 |
+
else:
|
138 |
+
pos_l = (
|
139 |
+
rms_list[
|
140 |
+
silence_start : silence_start + self.max_sil_kept + 1
|
141 |
+
].argmin()
|
142 |
+
+ silence_start
|
143 |
+
)
|
144 |
+
pos_r = (
|
145 |
+
rms_list[i - self.max_sil_kept : i + 1].argmin()
|
146 |
+
+ i
|
147 |
+
- self.max_sil_kept
|
148 |
+
)
|
149 |
+
if silence_start == 0:
|
150 |
+
sil_tags.append((0, pos_r))
|
151 |
+
else:
|
152 |
+
sil_tags.append((pos_l, pos_r))
|
153 |
+
clip_start = pos_r
|
154 |
+
silence_start = None
|
155 |
+
# Deal with trailing silence.
|
156 |
+
total_frames = rms_list.shape[0]
|
157 |
+
if (
|
158 |
+
silence_start is not None
|
159 |
+
and total_frames - silence_start >= self.min_interval
|
160 |
+
):
|
161 |
+
silence_end = min(total_frames, silence_start + self.max_sil_kept)
|
162 |
+
pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
|
163 |
+
sil_tags.append((pos, total_frames + 1))
|
164 |
+
# Apply and return slices.
|
165 |
+
####音频+起始时间+终止时间
|
166 |
+
if len(sil_tags) == 0:
|
167 |
+
return [[waveform,0,int(total_frames*self.hop_size)]]
|
168 |
+
else:
|
169 |
+
chunks = []
|
170 |
+
if sil_tags[0][0] > 0:
|
171 |
+
chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]),0,int(sil_tags[0][0]*self.hop_size)])
|
172 |
+
for i in range(len(sil_tags) - 1):
|
173 |
+
chunks.append(
|
174 |
+
[self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),int(sil_tags[i][1]*self.hop_size),int(sil_tags[i + 1][0]*self.hop_size)]
|
175 |
+
)
|
176 |
+
if sil_tags[-1][1] < total_frames:
|
177 |
+
chunks.append(
|
178 |
+
[self._apply_slice(waveform, sil_tags[-1][1], total_frames),int(sil_tags[-1][1]*self.hop_size),int(total_frames*self.hop_size)]
|
179 |
+
)
|
180 |
+
return chunks
|
181 |
+
|
182 |
+
|
183 |
+
def main():
|
184 |
+
import os.path
|
185 |
+
from argparse import ArgumentParser
|
186 |
+
|
187 |
+
import librosa
|
188 |
+
import soundfile
|
189 |
+
|
190 |
+
parser = ArgumentParser()
|
191 |
+
parser.add_argument("audio", type=str, help="The audio to be sliced")
|
192 |
+
parser.add_argument(
|
193 |
+
"--out", type=str, help="Output directory of the sliced audio clips"
|
194 |
+
)
|
195 |
+
parser.add_argument(
|
196 |
+
"--db_thresh",
|
197 |
+
type=float,
|
198 |
+
required=False,
|
199 |
+
default=-40,
|
200 |
+
help="The dB threshold for silence detection",
|
201 |
+
)
|
202 |
+
parser.add_argument(
|
203 |
+
"--min_length",
|
204 |
+
type=int,
|
205 |
+
required=False,
|
206 |
+
default=5000,
|
207 |
+
help="The minimum milliseconds required for each sliced audio clip",
|
208 |
+
)
|
209 |
+
parser.add_argument(
|
210 |
+
"--min_interval",
|
211 |
+
type=int,
|
212 |
+
required=False,
|
213 |
+
default=300,
|
214 |
+
help="The minimum milliseconds for a silence part to be sliced",
|
215 |
+
)
|
216 |
+
parser.add_argument(
|
217 |
+
"--hop_size",
|
218 |
+
type=int,
|
219 |
+
required=False,
|
220 |
+
default=10,
|
221 |
+
help="Frame length in milliseconds",
|
222 |
+
)
|
223 |
+
parser.add_argument(
|
224 |
+
"--max_sil_kept",
|
225 |
+
type=int,
|
226 |
+
required=False,
|
227 |
+
default=500,
|
228 |
+
help="The maximum silence length kept around the sliced clip, presented in milliseconds",
|
229 |
+
)
|
230 |
+
args = parser.parse_args()
|
231 |
+
out = args.out
|
232 |
+
if out is None:
|
233 |
+
out = os.path.dirname(os.path.abspath(args.audio))
|
234 |
+
audio, sr = librosa.load(args.audio, sr=None, mono=False)
|
235 |
+
slicer = Slicer(
|
236 |
+
sr=sr,
|
237 |
+
threshold=args.db_thresh,
|
238 |
+
min_length=args.min_length,
|
239 |
+
min_interval=args.min_interval,
|
240 |
+
hop_size=args.hop_size,
|
241 |
+
max_sil_kept=args.max_sil_kept,
|
242 |
+
)
|
243 |
+
chunks = slicer.slice(audio)
|
244 |
+
if not os.path.exists(out):
|
245 |
+
os.makedirs(out)
|
246 |
+
for i, chunk in enumerate(chunks):
|
247 |
+
if len(chunk.shape) > 1:
|
248 |
+
chunk = chunk.T
|
249 |
+
soundfile.write(
|
250 |
+
os.path.join(
|
251 |
+
out,
|
252 |
+
f"%s_%d.wav"
|
253 |
+
% (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
|
254 |
+
),
|
255 |
+
chunk,
|
256 |
+
sr,
|
257 |
+
)
|
258 |
+
|
259 |
+
|
260 |
+
if __name__ == "__main__":
|
261 |
+
main()
|
tools/subfix_webui.py
CHANGED
@@ -493,6 +493,6 @@ if __name__ == "__main__":
|
|
493 |
server_name="0.0.0.0",
|
494 |
inbrowser=True,
|
495 |
quiet=True,
|
496 |
-
share=
|
497 |
server_port=int(args.webui_port_subfix)
|
498 |
-
)
|
|
|
493 |
server_name="0.0.0.0",
|
494 |
inbrowser=True,
|
495 |
quiet=True,
|
496 |
+
share=eval(args.is_share),
|
497 |
server_port=int(args.webui_port_subfix)
|
498 |
+
)
|
tools/uvr5/lib/lib_v5/modelparams/4band_v3.json
CHANGED
@@ -1,54 +1,54 @@
|
|
1 |
-
{
|
2 |
-
"bins": 672,
|
3 |
-
"unstable_bins": 8,
|
4 |
-
"reduction_bins": 530,
|
5 |
-
"band": {
|
6 |
-
"1": {
|
7 |
-
"sr": 7350,
|
8 |
-
"hl": 80,
|
9 |
-
"n_fft": 640,
|
10 |
-
"crop_start": 0,
|
11 |
-
"crop_stop": 85,
|
12 |
-
"lpf_start": 25,
|
13 |
-
"lpf_stop": 53,
|
14 |
-
"res_type": "polyphase"
|
15 |
-
},
|
16 |
-
"2": {
|
17 |
-
"sr": 7350,
|
18 |
-
"hl": 80,
|
19 |
-
"n_fft": 320,
|
20 |
-
"crop_start": 4,
|
21 |
-
"crop_stop": 87,
|
22 |
-
"hpf_start": 25,
|
23 |
-
"hpf_stop": 12,
|
24 |
-
"lpf_start": 31,
|
25 |
-
"lpf_stop": 62,
|
26 |
-
"res_type": "polyphase"
|
27 |
-
},
|
28 |
-
"3": {
|
29 |
-
"sr": 14700,
|
30 |
-
"hl": 160,
|
31 |
-
"n_fft": 512,
|
32 |
-
"crop_start": 17,
|
33 |
-
"crop_stop": 216,
|
34 |
-
"hpf_start": 48,
|
35 |
-
"hpf_stop": 24,
|
36 |
-
"lpf_start": 139,
|
37 |
-
"lpf_stop": 210,
|
38 |
-
"res_type": "polyphase"
|
39 |
-
},
|
40 |
-
"4": {
|
41 |
-
"sr": 44100,
|
42 |
-
"hl": 480,
|
43 |
-
"n_fft": 960,
|
44 |
-
"crop_start": 78,
|
45 |
-
"crop_stop": 383,
|
46 |
-
"hpf_start": 130,
|
47 |
-
"hpf_stop": 86,
|
48 |
-
"res_type": "kaiser_fast"
|
49 |
-
}
|
50 |
-
},
|
51 |
-
"sr": 44100,
|
52 |
-
"pre_filter_start": 668,
|
53 |
-
"pre_filter_stop": 672
|
54 |
}
|
|
|
1 |
+
{
|
2 |
+
"bins": 672,
|
3 |
+
"unstable_bins": 8,
|
4 |
+
"reduction_bins": 530,
|
5 |
+
"band": {
|
6 |
+
"1": {
|
7 |
+
"sr": 7350,
|
8 |
+
"hl": 80,
|
9 |
+
"n_fft": 640,
|
10 |
+
"crop_start": 0,
|
11 |
+
"crop_stop": 85,
|
12 |
+
"lpf_start": 25,
|
13 |
+
"lpf_stop": 53,
|
14 |
+
"res_type": "polyphase"
|
15 |
+
},
|
16 |
+
"2": {
|
17 |
+
"sr": 7350,
|
18 |
+
"hl": 80,
|
19 |
+
"n_fft": 320,
|
20 |
+
"crop_start": 4,
|
21 |
+
"crop_stop": 87,
|
22 |
+
"hpf_start": 25,
|
23 |
+
"hpf_stop": 12,
|
24 |
+
"lpf_start": 31,
|
25 |
+
"lpf_stop": 62,
|
26 |
+
"res_type": "polyphase"
|
27 |
+
},
|
28 |
+
"3": {
|
29 |
+
"sr": 14700,
|
30 |
+
"hl": 160,
|
31 |
+
"n_fft": 512,
|
32 |
+
"crop_start": 17,
|
33 |
+
"crop_stop": 216,
|
34 |
+
"hpf_start": 48,
|
35 |
+
"hpf_stop": 24,
|
36 |
+
"lpf_start": 139,
|
37 |
+
"lpf_stop": 210,
|
38 |
+
"res_type": "polyphase"
|
39 |
+
},
|
40 |
+
"4": {
|
41 |
+
"sr": 44100,
|
42 |
+
"hl": 480,
|
43 |
+
"n_fft": 960,
|
44 |
+
"crop_start": 78,
|
45 |
+
"crop_stop": 383,
|
46 |
+
"hpf_start": 130,
|
47 |
+
"hpf_stop": 86,
|
48 |
+
"res_type": "kaiser_fast"
|
49 |
+
}
|
50 |
+
},
|
51 |
+
"sr": 44100,
|
52 |
+
"pre_filter_start": 668,
|
53 |
+
"pre_filter_stop": 672
|
54 |
}
|
tools/uvr5/webui.py
CHANGED
@@ -73,8 +73,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
|
|
73 |
os.path.basename(inp_path),
|
74 |
)
|
75 |
os.system(
|
76 |
-
|
77 |
-
% (inp_path, tmp_path)
|
78 |
)
|
79 |
inp_path = tmp_path
|
80 |
try:
|
|
|
73 |
os.path.basename(inp_path),
|
74 |
)
|
75 |
os.system(
|
76 |
+
f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y'
|
|
|
77 |
)
|
78 |
inp_path = tmp_path
|
79 |
try:
|
webui.py
CHANGED
@@ -1,878 +1,878 @@
|
|
1 |
-
import os,shutil,sys,pdb,re
|
2 |
-
now_dir = os.getcwd()
|
3 |
-
sys.path.insert(0, now_dir)
|
4 |
-
import json,yaml,warnings,torch
|
5 |
-
import platform
|
6 |
-
import psutil
|
7 |
-
import signal
|
8 |
-
|
9 |
-
warnings.filterwarnings("ignore")
|
10 |
-
torch.manual_seed(233333)
|
11 |
-
tmp = os.path.join(now_dir, "TEMP")
|
12 |
-
os.makedirs(tmp, exist_ok=True)
|
13 |
-
os.environ["TEMP"] = tmp
|
14 |
-
if(os.path.exists(tmp)):
|
15 |
-
for name in os.listdir(tmp):
|
16 |
-
if(name=="jieba.cache"):continue
|
17 |
-
path="%s/%s"%(tmp,name)
|
18 |
-
delete=os.remove if os.path.isfile(path) else shutil.rmtree
|
19 |
-
try:
|
20 |
-
delete(path)
|
21 |
-
except Exception as e:
|
22 |
-
print(str(e))
|
23 |
-
pass
|
24 |
-
import site
|
25 |
-
site_packages_roots = []
|
26 |
-
for path in site.getsitepackages():
|
27 |
-
if "packages" in path:
|
28 |
-
site_packages_roots.append(path)
|
29 |
-
if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir]
|
30 |
-
#os.environ["OPENBLAS_NUM_THREADS"] = "4"
|
31 |
-
os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
|
32 |
-
os.environ["all_proxy"] = ""
|
33 |
-
for site_packages_root in site_packages_roots:
|
34 |
-
if os.path.exists(site_packages_root):
|
35 |
-
try:
|
36 |
-
with open("%s/users.pth" % (site_packages_root), "w") as f:
|
37 |
-
f.write(
|
38 |
-
"%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
|
39 |
-
% (now_dir, now_dir, now_dir, now_dir, now_dir)
|
40 |
-
)
|
41 |
-
break
|
42 |
-
except PermissionError:
|
43 |
-
pass
|
44 |
-
from tools import my_utils
|
45 |
-
import traceback
|
46 |
-
import shutil
|
47 |
-
import pdb
|
48 |
-
import gradio as gr
|
49 |
-
from subprocess import Popen
|
50 |
-
import signal
|
51 |
-
from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share
|
52 |
-
from tools.i18n.i18n import I18nAuto
|
53 |
-
i18n = I18nAuto()
|
54 |
-
from scipy.io import wavfile
|
55 |
-
from tools.my_utils import load_audio
|
56 |
-
from multiprocessing import cpu_count
|
57 |
-
|
58 |
-
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
|
59 |
-
|
60 |
-
n_cpu=cpu_count()
|
61 |
-
|
62 |
-
ngpu = torch.cuda.device_count()
|
63 |
-
gpu_infos = []
|
64 |
-
mem = []
|
65 |
-
if_gpu_ok = False
|
66 |
-
|
67 |
-
# 判断是否有能用来训练和加速推理的N卡
|
68 |
-
if torch.cuda.is_available() or ngpu != 0:
|
69 |
-
for i in range(ngpu):
|
70 |
-
gpu_name = torch.cuda.get_device_name(i)
|
71 |
-
if any(value in gpu_name.upper()for value in ["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060"]):
|
72 |
-
# A10#A100#V100#A40#P40#M40#K80#A4500
|
73 |
-
if_gpu_ok = True # 至少有一张能用的N卡
|
74 |
-
gpu_infos.append("%s\t%s" % (i, gpu_name))
|
75 |
-
mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
|
76 |
-
# # 判断是否支持mps加速
|
77 |
-
# if torch.backends.mps.is_available():
|
78 |
-
# if_gpu_ok = True
|
79 |
-
# gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
|
80 |
-
# mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存
|
81 |
-
|
82 |
-
if if_gpu_ok and len(gpu_infos) > 0:
|
83 |
-
gpu_info = "\n".join(gpu_infos)
|
84 |
-
default_batch_size = min(mem) // 2
|
85 |
-
else:
|
86 |
-
gpu_info = ("%s\t%s" % ("0", "CPU"))
|
87 |
-
gpu_infos.append("%s\t%s" % ("0", "CPU"))
|
88 |
-
default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2
|
89 |
-
gpus = "-".join([i[0] for i in gpu_infos])
|
90 |
-
|
91 |
-
pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
|
92 |
-
pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
|
93 |
-
def get_weights_names():
|
94 |
-
SoVITS_names = [pretrained_sovits_name]
|
95 |
-
for name in os.listdir(SoVITS_weight_root):
|
96 |
-
if name.endswith(".pth"):SoVITS_names.append(name)
|
97 |
-
GPT_names = [pretrained_gpt_name]
|
98 |
-
for name in os.listdir(GPT_weight_root):
|
99 |
-
if name.endswith(".ckpt"): GPT_names.append(name)
|
100 |
-
return SoVITS_names,GPT_names
|
101 |
-
SoVITS_weight_root="SoVITS_weights"
|
102 |
-
GPT_weight_root="GPT_weights"
|
103 |
-
os.makedirs(SoVITS_weight_root,exist_ok=True)
|
104 |
-
os.makedirs(GPT_weight_root,exist_ok=True)
|
105 |
-
SoVITS_names,GPT_names = get_weights_names()
|
106 |
-
|
107 |
-
def custom_sort_key(s):
|
108 |
-
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
109 |
-
parts = re.split('(\d+)', s)
|
110 |
-
# 将数字部分转换为整数,非数字部分保持不变
|
111 |
-
parts = [int(part) if part.isdigit() else part for part in parts]
|
112 |
-
return parts
|
113 |
-
|
114 |
-
def change_choices():
|
115 |
-
SoVITS_names, GPT_names = get_weights_names()
|
116 |
-
return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
|
117 |
-
|
118 |
-
p_label=None
|
119 |
-
p_uvr5=None
|
120 |
-
p_asr=None
|
121 |
-
p_denoise=None
|
122 |
-
p_tts_inference=None
|
123 |
-
|
124 |
-
def kill_proc_tree(pid, including_parent=True):
|
125 |
-
try:
|
126 |
-
parent = psutil.Process(pid)
|
127 |
-
except psutil.NoSuchProcess:
|
128 |
-
# Process already terminated
|
129 |
-
return
|
130 |
-
|
131 |
-
children = parent.children(recursive=True)
|
132 |
-
for child in children:
|
133 |
-
try:
|
134 |
-
os.kill(child.pid, signal.SIGTERM) # or signal.SIGKILL
|
135 |
-
except OSError:
|
136 |
-
pass
|
137 |
-
if including_parent:
|
138 |
-
try:
|
139 |
-
os.kill(parent.pid, signal.SIGTERM) # or signal.SIGKILL
|
140 |
-
except OSError:
|
141 |
-
pass
|
142 |
-
|
143 |
-
system=platform.system()
|
144 |
-
def kill_process(pid):
|
145 |
-
if(system=="Windows"):
|
146 |
-
cmd = "taskkill /t /f /pid %s" % pid
|
147 |
-
os.system(cmd)
|
148 |
-
else:
|
149 |
-
kill_proc_tree(pid)
|
150 |
-
|
151 |
-
|
152 |
-
def change_label(if_label,path_list):
|
153 |
-
global p_label
|
154 |
-
if(if_label==True and p_label==None):
|
155 |
-
path_list=my_utils.clean_path(path_list)
|
156 |
-
cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share)
|
157 |
-
yield i18n("打标工具WebUI已开启")
|
158 |
-
print(cmd)
|
159 |
-
p_label = Popen(cmd, shell=True)
|
160 |
-
elif(if_label==False and p_label!=None):
|
161 |
-
kill_process(p_label.pid)
|
162 |
-
p_label=None
|
163 |
-
yield i18n("打标工具WebUI已关闭")
|
164 |
-
|
165 |
-
def change_uvr5(if_uvr5):
|
166 |
-
global p_uvr5
|
167 |
-
if(if_uvr5==True and p_uvr5==None):
|
168 |
-
cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share)
|
169 |
-
yield i18n("UVR5已开启")
|
170 |
-
print(cmd)
|
171 |
-
p_uvr5 = Popen(cmd, shell=True)
|
172 |
-
elif(if_uvr5==False and p_uvr5!=None):
|
173 |
-
kill_process(p_uvr5.pid)
|
174 |
-
p_uvr5=None
|
175 |
-
yield i18n("UVR5已关闭")
|
176 |
-
|
177 |
-
def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path):
|
178 |
-
global p_tts_inference
|
179 |
-
if(if_tts==True and p_tts_inference==None):
|
180 |
-
os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
|
181 |
-
os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
|
182 |
-
os.environ["cnhubert_base_path"]=cnhubert_base_path
|
183 |
-
os.environ["bert_path"]=bert_path
|
184 |
-
os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number
|
185 |
-
os.environ["is_half"]=str(is_half)
|
186 |
-
os.environ["infer_ttswebui"]=str(webui_port_infer_tts)
|
187 |
-
os.environ["is_share"]=str(is_share)
|
188 |
-
cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec)
|
189 |
-
yield i18n("TTS推理进程已开启")
|
190 |
-
print(cmd)
|
191 |
-
p_tts_inference = Popen(cmd, shell=True)
|
192 |
-
elif(if_tts==False and p_tts_inference!=None):
|
193 |
-
kill_process(p_tts_inference.pid)
|
194 |
-
p_tts_inference=None
|
195 |
-
yield i18n("TTS推理进程已关闭")
|
196 |
-
|
197 |
-
from tools.asr.config import asr_dict
|
198 |
-
def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
|
199 |
-
global p_asr
|
200 |
-
if(p_asr==None):
|
201 |
-
asr_inp_dir=my_utils.clean_path(asr_inp_dir)
|
202 |
-
cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}'
|
203 |
-
cmd += f' -i "{asr_inp_dir}"'
|
204 |
-
cmd += f' -o "{asr_opt_dir}"'
|
205 |
-
cmd += f' -s {asr_model_size}'
|
206 |
-
cmd += f' -l {asr_lang}'
|
207 |
-
cmd += " -p %s"%("float16"if is_half==True else "float32")
|
208 |
-
|
209 |
-
yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
210 |
-
print(cmd)
|
211 |
-
p_asr = Popen(cmd, shell=True)
|
212 |
-
p_asr.wait()
|
213 |
-
p_asr=None
|
214 |
-
yield f"ASR任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
215 |
-
else:
|
216 |
-
yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
217 |
-
# return None
|
218 |
-
|
219 |
-
def close_asr():
|
220 |
-
global p_asr
|
221 |
-
if(p_asr!=None):
|
222 |
-
kill_process(p_asr.pid)
|
223 |
-
p_asr=None
|
224 |
-
return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
225 |
-
def open_denoise(denoise_inp_dir, denoise_opt_dir):
|
226 |
-
global p_denoise
|
227 |
-
if(p_denoise==None):
|
228 |
-
denoise_inp_dir=my_utils.clean_path(denoise_inp_dir)
|
229 |
-
denoise_opt_dir=my_utils.clean_path(denoise_opt_dir)
|
230 |
-
cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32")
|
231 |
-
|
232 |
-
yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
233 |
-
print(cmd)
|
234 |
-
p_denoise = Popen(cmd, shell=True)
|
235 |
-
p_denoise.wait()
|
236 |
-
p_denoise=None
|
237 |
-
yield f"语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
238 |
-
else:
|
239 |
-
yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
240 |
-
# return None
|
241 |
-
|
242 |
-
def close_denoise():
|
243 |
-
global p_denoise
|
244 |
-
if(p_denoise!=None):
|
245 |
-
kill_process(p_denoise.pid)
|
246 |
-
p_denoise=None
|
247 |
-
return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
248 |
-
|
249 |
-
p_train_SoVITS=None
|
250 |
-
def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
|
251 |
-
global p_train_SoVITS
|
252 |
-
if(p_train_SoVITS==None):
|
253 |
-
with open("GPT_SoVITS/configs/s2.json")as f:
|
254 |
-
data=f.read()
|
255 |
-
data=json.loads(data)
|
256 |
-
s2_dir="%s/%s"%(exp_root,exp_name)
|
257 |
-
os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True)
|
258 |
-
if(is_half==False):
|
259 |
-
data["train"]["fp16_run"]=False
|
260 |
-
batch_size=max(1,batch_size//2)
|
261 |
-
data["train"]["batch_size"]=batch_size
|
262 |
-
data["train"]["epochs"]=total_epoch
|
263 |
-
data["train"]["text_low_lr_rate"]=text_low_lr_rate
|
264 |
-
data["train"]["pretrained_s2G"]=pretrained_s2G
|
265 |
-
data["train"]["pretrained_s2D"]=pretrained_s2D
|
266 |
-
data["train"]["if_save_latest"]=if_save_latest
|
267 |
-
data["train"]["if_save_every_weights"]=if_save_every_weights
|
268 |
-
data["train"]["save_every_epoch"]=save_every_epoch
|
269 |
-
data["train"]["gpu_numbers"]=gpu_numbers1Ba
|
270 |
-
data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir
|
271 |
-
data["save_weight_dir"]=SoVITS_weight_root
|
272 |
-
data["name"]=exp_name
|
273 |
-
tmp_config_path="%s/tmp_s2.json"%tmp
|
274 |
-
with open(tmp_config_path,"w")as f:f.write(json.dumps(data))
|
275 |
-
|
276 |
-
cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path)
|
277 |
-
yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
278 |
-
print(cmd)
|
279 |
-
p_train_SoVITS = Popen(cmd, shell=True)
|
280 |
-
p_train_SoVITS.wait()
|
281 |
-
p_train_SoVITS=None
|
282 |
-
yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
283 |
-
else:
|
284 |
-
yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
285 |
-
|
286 |
-
def close1Ba():
|
287 |
-
global p_train_SoVITS
|
288 |
-
if(p_train_SoVITS!=None):
|
289 |
-
kill_process(p_train_SoVITS.pid)
|
290 |
-
p_train_SoVITS=None
|
291 |
-
return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
292 |
-
|
293 |
-
p_train_GPT=None
|
294 |
-
def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1):
|
295 |
-
global p_train_GPT
|
296 |
-
if(p_train_GPT==None):
|
297 |
-
with open("GPT_SoVITS/configs/s1longer.yaml")as f:
|
298 |
-
data=f.read()
|
299 |
-
data=yaml.load(data, Loader=yaml.FullLoader)
|
300 |
-
s1_dir="%s/%s"%(exp_root,exp_name)
|
301 |
-
os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True)
|
302 |
-
if(is_half==False):
|
303 |
-
data["train"]["precision"]="32"
|
304 |
-
batch_size = max(1, batch_size // 2)
|
305 |
-
data["train"]["batch_size"]=batch_size
|
306 |
-
data["train"]["epochs"]=total_epoch
|
307 |
-
data["pretrained_s1"]=pretrained_s1
|
308 |
-
data["train"]["save_every_n_epoch"]=save_every_epoch
|
309 |
-
data["train"]["if_save_every_weights"]=if_save_every_weights
|
310 |
-
data["train"]["if_save_latest"]=if_save_latest
|
311 |
-
data["train"]["if_dpo"]=if_dpo
|
312 |
-
data["train"]["half_weights_save_dir"]=GPT_weight_root
|
313 |
-
data["train"]["exp_name"]=exp_name
|
314 |
-
data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir
|
315 |
-
data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir
|
316 |
-
data["output_dir"]="%s/logs_s1"%s1_dir
|
317 |
-
|
318 |
-
os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",")
|
319 |
-
os.environ["hz"]="25hz"
|
320 |
-
tmp_config_path="%s/tmp_s1.yaml"%tmp
|
321 |
-
with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False))
|
322 |
-
# cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir)
|
323 |
-
cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path)
|
324 |
-
yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
325 |
-
print(cmd)
|
326 |
-
p_train_GPT = Popen(cmd, shell=True)
|
327 |
-
p_train_GPT.wait()
|
328 |
-
p_train_GPT=None
|
329 |
-
yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
330 |
-
else:
|
331 |
-
yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
332 |
-
|
333 |
-
def close1Bb():
|
334 |
-
global p_train_GPT
|
335 |
-
if(p_train_GPT!=None):
|
336 |
-
kill_process(p_train_GPT.pid)
|
337 |
-
p_train_GPT=None
|
338 |
-
return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
339 |
-
|
340 |
-
ps_slice=[]
|
341 |
-
def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts):
|
342 |
-
global ps_slice
|
343 |
-
inp = my_utils.clean_path(inp)
|
344 |
-
opt_root = my_utils.clean_path(opt_root)
|
345 |
-
if(os.path.exists(inp)==False):
|
346 |
-
yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
347 |
-
return
|
348 |
-
if os.path.isfile(inp):n_parts=1
|
349 |
-
elif os.path.isdir(inp):pass
|
350 |
-
else:
|
351 |
-
yield "
|
352 |
-
return
|
353 |
-
if (ps_slice == []):
|
354 |
-
for i_part in range(n_parts):
|
355 |
-
cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts)
|
356 |
-
print(cmd)
|
357 |
-
p = Popen(cmd, shell=True)
|
358 |
-
ps_slice.append(p)
|
359 |
-
yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
360 |
-
for p in ps_slice:
|
361 |
-
p.wait()
|
362 |
-
ps_slice=[]
|
363 |
-
yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
364 |
-
else:
|
365 |
-
yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
366 |
-
|
367 |
-
def close_slice():
|
368 |
-
global ps_slice
|
369 |
-
if (ps_slice != []):
|
370 |
-
for p_slice in ps_slice:
|
371 |
-
try:
|
372 |
-
kill_process(p_slice.pid)
|
373 |
-
except:
|
374 |
-
traceback.print_exc()
|
375 |
-
ps_slice=[]
|
376 |
-
return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
377 |
-
|
378 |
-
ps1a=[]
|
379 |
-
def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir):
|
380 |
-
global ps1a
|
381 |
-
inp_text = my_utils.clean_path(inp_text)
|
382 |
-
inp_wav_dir = my_utils.clean_path(inp_wav_dir)
|
383 |
-
if (ps1a == []):
|
384 |
-
opt_dir="%s/%s"%(exp_root,exp_name)
|
385 |
-
config={
|
386 |
-
"inp_text":inp_text,
|
387 |
-
"inp_wav_dir":inp_wav_dir,
|
388 |
-
"exp_name":exp_name,
|
389 |
-
"opt_dir":opt_dir,
|
390 |
-
"bert_pretrained_dir":bert_pretrained_dir,
|
391 |
-
}
|
392 |
-
gpu_names=gpu_numbers.split("-")
|
393 |
-
all_parts=len(gpu_names)
|
394 |
-
for i_part in range(all_parts):
|
395 |
-
config.update(
|
396 |
-
{
|
397 |
-
"i_part": str(i_part),
|
398 |
-
"all_parts": str(all_parts),
|
399 |
-
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
400 |
-
"is_half": str(is_half)
|
401 |
-
}
|
402 |
-
)
|
403 |
-
os.environ.update(config)
|
404 |
-
cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
|
405 |
-
print(cmd)
|
406 |
-
p = Popen(cmd, shell=True)
|
407 |
-
ps1a.append(p)
|
408 |
-
yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
409 |
-
for p in ps1a:
|
410 |
-
p.wait()
|
411 |
-
opt = []
|
412 |
-
for i_part in range(all_parts):
|
413 |
-
txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
|
414 |
-
with open(txt_path, "r", encoding="utf8") as f:
|
415 |
-
opt += f.read().strip("\n").split("\n")
|
416 |
-
os.remove(txt_path)
|
417 |
-
path_text = "%s/2-name2text.txt" % opt_dir
|
418 |
-
with open(path_text, "w", encoding="utf8") as f:
|
419 |
-
f.write("\n".join(opt) + "\n")
|
420 |
-
ps1a=[]
|
421 |
-
yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
422 |
-
else:
|
423 |
-
yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
424 |
-
|
425 |
-
def close1a():
|
426 |
-
global ps1a
|
427 |
-
if (ps1a != []):
|
428 |
-
for p1a in ps1a:
|
429 |
-
try:
|
430 |
-
kill_process(p1a.pid)
|
431 |
-
except:
|
432 |
-
traceback.print_exc()
|
433 |
-
ps1a=[]
|
434 |
-
return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
435 |
-
|
436 |
-
ps1b=[]
|
437 |
-
def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir):
|
438 |
-
global ps1b
|
439 |
-
inp_text = my_utils.clean_path(inp_text)
|
440 |
-
inp_wav_dir = my_utils.clean_path(inp_wav_dir)
|
441 |
-
if (ps1b == []):
|
442 |
-
config={
|
443 |
-
"inp_text":inp_text,
|
444 |
-
"inp_wav_dir":inp_wav_dir,
|
445 |
-
"exp_name":exp_name,
|
446 |
-
"opt_dir":"%s/%s"%(exp_root,exp_name),
|
447 |
-
"cnhubert_base_dir":ssl_pretrained_dir,
|
448 |
-
"is_half": str(is_half)
|
449 |
-
}
|
450 |
-
gpu_names=gpu_numbers.split("-")
|
451 |
-
all_parts=len(gpu_names)
|
452 |
-
for i_part in range(all_parts):
|
453 |
-
config.update(
|
454 |
-
{
|
455 |
-
"i_part": str(i_part),
|
456 |
-
"all_parts": str(all_parts),
|
457 |
-
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
458 |
-
}
|
459 |
-
)
|
460 |
-
os.environ.update(config)
|
461 |
-
cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
|
462 |
-
print(cmd)
|
463 |
-
p = Popen(cmd, shell=True)
|
464 |
-
ps1b.append(p)
|
465 |
-
yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
466 |
-
for p in ps1b:
|
467 |
-
p.wait()
|
468 |
-
ps1b=[]
|
469 |
-
yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
470 |
-
else:
|
471 |
-
yield "
|
472 |
-
|
473 |
-
def close1b():
|
474 |
-
global ps1b
|
475 |
-
if (ps1b != []):
|
476 |
-
for p1b in ps1b:
|
477 |
-
try:
|
478 |
-
kill_process(p1b.pid)
|
479 |
-
except:
|
480 |
-
traceback.print_exc()
|
481 |
-
ps1b=[]
|
482 |
-
return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
483 |
-
|
484 |
-
ps1c=[]
|
485 |
-
def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path):
|
486 |
-
global ps1c
|
487 |
-
inp_text = my_utils.clean_path(inp_text)
|
488 |
-
if (ps1c == []):
|
489 |
-
opt_dir="%s/%s"%(exp_root,exp_name)
|
490 |
-
config={
|
491 |
-
"inp_text":inp_text,
|
492 |
-
"exp_name":exp_name,
|
493 |
-
"opt_dir":opt_dir,
|
494 |
-
"pretrained_s2G":pretrained_s2G_path,
|
495 |
-
"s2config_path":"GPT_SoVITS/configs/s2.json",
|
496 |
-
"is_half": str(is_half)
|
497 |
-
}
|
498 |
-
gpu_names=gpu_numbers.split("-")
|
499 |
-
all_parts=len(gpu_names)
|
500 |
-
for i_part in range(all_parts):
|
501 |
-
config.update(
|
502 |
-
{
|
503 |
-
"i_part": str(i_part),
|
504 |
-
"all_parts": str(all_parts),
|
505 |
-
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
506 |
-
}
|
507 |
-
)
|
508 |
-
os.environ.update(config)
|
509 |
-
cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
|
510 |
-
print(cmd)
|
511 |
-
p = Popen(cmd, shell=True)
|
512 |
-
ps1c.append(p)
|
513 |
-
yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
514 |
-
for p in ps1c:
|
515 |
-
p.wait()
|
516 |
-
opt = ["item_name\tsemantic_audio"]
|
517 |
-
path_semantic = "%s/6-name2semantic.tsv" % opt_dir
|
518 |
-
for i_part in range(all_parts):
|
519 |
-
semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
|
520 |
-
with open(semantic_path, "r", encoding="utf8") as f:
|
521 |
-
opt += f.read().strip("\n").split("\n")
|
522 |
-
os.remove(semantic_path)
|
523 |
-
with open(path_semantic, "w", encoding="utf8") as f:
|
524 |
-
f.write("\n".join(opt) + "\n")
|
525 |
-
ps1c=[]
|
526 |
-
yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
527 |
-
else:
|
528 |
-
yield "
|
529 |
-
|
530 |
-
def close1c():
|
531 |
-
global ps1c
|
532 |
-
if (ps1c != []):
|
533 |
-
for p1c in ps1c:
|
534 |
-
try:
|
535 |
-
kill_process(p1c.pid)
|
536 |
-
except:
|
537 |
-
traceback.print_exc()
|
538 |
-
ps1c=[]
|
539 |
-
return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
540 |
-
#####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G
|
541 |
-
ps1abc=[]
|
542 |
-
def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path):
|
543 |
-
global ps1abc
|
544 |
-
inp_text = my_utils.clean_path(inp_text)
|
545 |
-
inp_wav_dir = my_utils.clean_path(inp_wav_dir)
|
546 |
-
if (ps1abc == []):
|
547 |
-
opt_dir="%s/%s"%(exp_root,exp_name)
|
548 |
-
try:
|
549 |
-
#############################1a
|
550 |
-
path_text="%s/2-name2text.txt" % opt_dir
|
551 |
-
if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and len(open(path_text,"r",encoding="utf8").read().strip("\n").split("\n"))<2)):
|
552 |
-
config={
|
553 |
-
"inp_text":inp_text,
|
554 |
-
"inp_wav_dir":inp_wav_dir,
|
555 |
-
"exp_name":exp_name,
|
556 |
-
"opt_dir":opt_dir,
|
557 |
-
"bert_pretrained_dir":bert_pretrained_dir,
|
558 |
-
"is_half": str(is_half)
|
559 |
-
}
|
560 |
-
gpu_names=gpu_numbers1a.split("-")
|
561 |
-
all_parts=len(gpu_names)
|
562 |
-
for i_part in range(all_parts):
|
563 |
-
config.update(
|
564 |
-
{
|
565 |
-
"i_part": str(i_part),
|
566 |
-
"all_parts": str(all_parts),
|
567 |
-
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
568 |
-
}
|
569 |
-
)
|
570 |
-
os.environ.update(config)
|
571 |
-
cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
|
572 |
-
print(cmd)
|
573 |
-
p = Popen(cmd, shell=True)
|
574 |
-
ps1abc.append(p)
|
575 |
-
yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
576 |
-
for p in ps1abc:p.wait()
|
577 |
-
|
578 |
-
opt = []
|
579 |
-
for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part)
|
580 |
-
txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
|
581 |
-
with open(txt_path, "r",encoding="utf8") as f:
|
582 |
-
opt += f.read().strip("\n").split("\n")
|
583 |
-
os.remove(txt_path)
|
584 |
-
with open(path_text, "w",encoding="utf8") as f:
|
585 |
-
f.write("\n".join(opt) + "\n")
|
586 |
-
|
587 |
-
yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
588 |
-
ps1abc=[]
|
589 |
-
#############################1b
|
590 |
-
config={
|
591 |
-
"inp_text":inp_text,
|
592 |
-
"inp_wav_dir":inp_wav_dir,
|
593 |
-
"exp_name":exp_name,
|
594 |
-
"opt_dir":opt_dir,
|
595 |
-
"cnhubert_base_dir":ssl_pretrained_dir,
|
596 |
-
}
|
597 |
-
gpu_names=gpu_numbers1Ba.split("-")
|
598 |
-
all_parts=len(gpu_names)
|
599 |
-
for i_part in range(all_parts):
|
600 |
-
config.update(
|
601 |
-
{
|
602 |
-
"i_part": str(i_part),
|
603 |
-
"all_parts": str(all_parts),
|
604 |
-
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
605 |
-
}
|
606 |
-
)
|
607 |
-
os.environ.update(config)
|
608 |
-
cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
|
609 |
-
print(cmd)
|
610 |
-
p = Popen(cmd, shell=True)
|
611 |
-
ps1abc.append(p)
|
612 |
-
yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
613 |
-
for p in ps1abc:p.wait()
|
614 |
-
yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
615 |
-
ps1abc=[]
|
616 |
-
#############################1c
|
617 |
-
path_semantic = "%s/6-name2semantic.tsv" % opt_dir
|
618 |
-
if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)):
|
619 |
-
config={
|
620 |
-
"inp_text":inp_text,
|
621 |
-
"exp_name":exp_name,
|
622 |
-
"opt_dir":opt_dir,
|
623 |
-
"pretrained_s2G":pretrained_s2G_path,
|
624 |
-
"s2config_path":"GPT_SoVITS/configs/s2.json",
|
625 |
-
}
|
626 |
-
gpu_names=gpu_numbers1c.split("-")
|
627 |
-
all_parts=len(gpu_names)
|
628 |
-
for i_part in range(all_parts):
|
629 |
-
config.update(
|
630 |
-
{
|
631 |
-
"i_part": str(i_part),
|
632 |
-
"all_parts": str(all_parts),
|
633 |
-
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
634 |
-
}
|
635 |
-
)
|
636 |
-
os.environ.update(config)
|
637 |
-
cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
|
638 |
-
print(cmd)
|
639 |
-
p = Popen(cmd, shell=True)
|
640 |
-
ps1abc.append(p)
|
641 |
-
yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
642 |
-
for p in ps1abc:p.wait()
|
643 |
-
|
644 |
-
opt = ["item_name\tsemantic_audio"]
|
645 |
-
for i_part in range(all_parts):
|
646 |
-
semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
|
647 |
-
with open(semantic_path, "r",encoding="utf8") as f:
|
648 |
-
opt += f.read().strip("\n").split("\n")
|
649 |
-
os.remove(semantic_path)
|
650 |
-
with open(path_semantic, "w",encoding="utf8") as f:
|
651 |
-
f.write("\n".join(opt) + "\n")
|
652 |
-
yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
653 |
-
ps1abc = []
|
654 |
-
yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
655 |
-
except:
|
656 |
-
traceback.print_exc()
|
657 |
-
close1abc()
|
658 |
-
yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
659 |
-
else:
|
660 |
-
yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
661 |
-
|
662 |
-
def close1abc():
|
663 |
-
global ps1abc
|
664 |
-
if (ps1abc != []):
|
665 |
-
for p1abc in ps1abc:
|
666 |
-
try:
|
667 |
-
kill_process(p1abc.pid)
|
668 |
-
except:
|
669 |
-
traceback.print_exc()
|
670 |
-
ps1abc=[]
|
671 |
-
return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
672 |
-
|
673 |
-
with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
674 |
-
gr.Markdown(
|
675 |
-
value=
|
676 |
-
i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
|
677 |
-
)
|
678 |
-
gr.Markdown(
|
679 |
-
value=
|
680 |
-
i18n("中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e")
|
681 |
-
)
|
682 |
-
|
683 |
-
with gr.Tabs():
|
684 |
-
with gr.TabItem(i18n("0-前置数据集获取工具")):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
|
685 |
-
gr.Markdown(value=i18n("0a-UVR5人声伴奏分离&去混响去延迟工具"))
|
686 |
-
with gr.Row():
|
687 |
-
if_uvr5 = gr.Checkbox(label=i18n("是否开启UVR5-WebUI"),show_label=True)
|
688 |
-
uvr5_info = gr.Textbox(label=i18n("UVR5进程输出信息"))
|
689 |
-
gr.Markdown(value=i18n("0b-语音切分工具"))
|
690 |
-
with gr.Row():
|
691 |
-
with gr.Row():
|
692 |
-
slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),value="")
|
693 |
-
slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt")
|
694 |
-
threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34")
|
695 |
-
min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000")
|
696 |
-
min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300")
|
697 |
-
hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10")
|
698 |
-
max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500")
|
699 |
-
with gr.Row():
|
700 |
-
open_slicer_button=gr.Button(i18n("开启语音切割"), variant="primary",visible=True)
|
701 |
-
close_slicer_button=gr.Button(i18n("终止语音切割"), variant="primary",visible=False)
|
702 |
-
_max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True)
|
703 |
-
alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
|
704 |
-
n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
|
705 |
-
slicer_info = gr.Textbox(label=i18n("
|
706 |
-
gr.Markdown(value=i18n("0bb-语音降噪工具"))
|
707 |
-
with gr.Row():
|
708 |
-
open_denoise_button = gr.Button(i18n("开启语音降噪"), visible=True)
|
709 |
-
close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False)
|
710 |
-
denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="")
|
711 |
-
denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt")
|
712 |
-
denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息"))
|
713 |
-
gr.Markdown(value=i18n("0c-中文批量离线ASR工具"))
|
714 |
-
with gr.Row():
|
715 |
-
open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True)
|
716 |
-
close_asr_button = gr.Button(i18n("终止ASR进程"), variant="primary",visible=False)
|
717 |
-
with gr.Column():
|
718 |
-
with gr.Row():
|
719 |
-
asr_inp_dir = gr.Textbox(
|
720 |
-
label=i18n("输入文件夹路径"),
|
721 |
-
value="
|
722 |
-
interactive=True,
|
723 |
-
)
|
724 |
-
asr_opt_dir = gr.Textbox(
|
725 |
-
label = i18n("输出文件夹路径"),
|
726 |
-
value = "output/asr_opt",
|
727 |
-
interactive = True,
|
728 |
-
)
|
729 |
-
with gr.Row():
|
730 |
-
asr_model = gr.Dropdown(
|
731 |
-
label = i18n("ASR 模型"),
|
732 |
-
choices = list(asr_dict.keys()),
|
733 |
-
interactive = True,
|
734 |
-
value="达摩 ASR (中文)"
|
735 |
-
)
|
736 |
-
asr_size = gr.Dropdown(
|
737 |
-
label = i18n("ASR 模型尺寸"),
|
738 |
-
choices = ["large"],
|
739 |
-
interactive = True,
|
740 |
-
value="large"
|
741 |
-
)
|
742 |
-
asr_lang = gr.Dropdown(
|
743 |
-
label = i18n("ASR 语言设置"),
|
744 |
-
choices = ["zh"],
|
745 |
-
interactive = True,
|
746 |
-
value="zh"
|
747 |
-
)
|
748 |
-
with gr.Row():
|
749 |
-
asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))
|
750 |
-
|
751 |
-
def change_lang_choices(key): # update the selectable languages based on the chosen model
|
752 |
-
# return gr.Dropdown(choices=asr_dict[key]['lang'])
|
753 |
-
return {"__type__": "update", "choices": asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]}
|
754 |
-
def change_size_choices(key): # update the selectable model sizes based on the chosen model
|
755 |
-
# return gr.Dropdown(choices=asr_dict[key]['size'])
|
756 |
-
return {"__type__": "update", "choices": asr_dict[key]['size']}
|
757 |
-
asr_model.change(change_lang_choices, [asr_model], [asr_lang])
|
758 |
-
asr_model.change(change_size_choices, [asr_model], [asr_size])
|
759 |
-
|
760 |
-
gr.Markdown(value=i18n("0d-语音文本校对标注工具"))
|
761 |
-
with gr.Row():
|
762 |
-
if_label = gr.Checkbox(label=i18n("是否开启打标WebUI"),show_label=True)
|
763 |
-
path_list = gr.Textbox(
|
764 |
-
label=i18n(".list标注文件的路径"),
|
765 |
-
value="
|
766 |
-
interactive=True,
|
767 |
-
)
|
768 |
-
label_info = gr.Textbox(label=i18n("打标工具进程输出信息"))
|
769 |
-
if_label.change(change_label, [if_label,path_list], [label_info])
|
770 |
-
if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info])
|
771 |
-
open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang], [asr_info,open_asr_button,close_asr_button])
|
772 |
-
close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button])
|
773 |
-
open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button])
|
774 |
-
close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button])
|
775 |
-
open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button])
|
776 |
-
close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button])
|
777 |
-
|
778 |
-
with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
|
779 |
-
with gr.Row():
|
780 |
-
exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
|
781 |
-
gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
|
782 |
-
pretrained_s2G = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value="GPT_SoVITS/pretrained_models/s2G488k.pth", interactive=True)
|
783 |
-
pretrained_s2D = gr.Textbox(label=i18n("预训练的SoVITS-D模型路径"), value="GPT_SoVITS/pretrained_models/s2D488k.pth", interactive=True)
|
784 |
-
pretrained_s1 = gr.Textbox(label=i18n("预训练的GPT模型路径"), value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", interactive=True)
|
785 |
-
with gr.TabItem(i18n("1A-训练集格式化工具")):
|
786 |
-
gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹"))
|
787 |
-
with gr.Row():
|
788 |
-
inp_text = gr.Textbox(label=i18n("*文本标注文件"),value="
|
789 |
-
inp_wav_dir = gr.Textbox(
|
790 |
-
label=i18n("*训练集音频文件目录"),
|
791 |
-
value="
|
792 |
-
interactive=True,
|
793 |
-
placeholder=i18n("填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。")
|
794 |
-
)
|
795 |
-
gr.Markdown(value=i18n("1Aa-文本内容"))
|
796 |
-
with gr.Row():
|
797 |
-
gpu_numbers1a = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
|
798 |
-
bert_pretrained_dir = gr.Textbox(label=i18n("预训练的中文BERT模型路径"),value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False)
|
799 |
-
button1a_open = gr.Button(i18n("开启文本获取"), variant="primary",visible=True)
|
800 |
-
button1a_close = gr.Button(i18n("终止文本获取进程"), variant="primary",visible=False)
|
801 |
-
info1a=gr.Textbox(label=i18n("文本进程输出信息"))
|
802 |
-
gr.Markdown(value=i18n("1Ab-SSL自监督特征提取"))
|
803 |
-
with gr.Row():
|
804 |
-
gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
|
805 |
-
cnhubert_base_dir = gr.Textbox(label=i18n("预训练的SSL模型路径"),value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False)
|
806 |
-
button1b_open = gr.Button(i18n("开启SSL提取"), variant="primary",visible=True)
|
807 |
-
button1b_close = gr.Button(i18n("终止SSL提取进程"), variant="primary",visible=False)
|
808 |
-
info1b=gr.Textbox(label=i18n("SSL进程输出信息"))
|
809 |
-
gr.Markdown(value=i18n("1Ac-语义token提取"))
|
810 |
-
with gr.Row():
|
811 |
-
gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
|
812 |
-
button1c_open = gr.Button(i18n("开启语义token提取"), variant="primary",visible=True)
|
813 |
-
button1c_close = gr.Button(i18n("终止语义token提取进程"), variant="primary",visible=False)
|
814 |
-
info1c=gr.Textbox(label=i18n("语义token提取进程输出信息"))
|
815 |
-
gr.Markdown(value=i18n("1Aabc-训练集格式化一键三连"))
|
816 |
-
with gr.Row():
|
817 |
-
button1abc_open = gr.Button(i18n("开启一键三连"), variant="primary",visible=True)
|
818 |
-
button1abc_close = gr.Button(i18n("终止一键三连"), variant="primary",visible=False)
|
819 |
-
info1abc=gr.Textbox(label=i18n("一键三连进程输出信息"))
|
820 |
-
button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close])
|
821 |
-
button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close])
|
822 |
-
button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close])
|
823 |
-
button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close])
|
824 |
-
button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close])
|
825 |
-
button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close])
|
826 |
-
button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close])
|
827 |
-
button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close])
|
828 |
-
with gr.TabItem(i18n("1B-微调训练")):
|
829 |
-
gr.Markdown(value=i18n("1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。"))
|
830 |
-
with gr.Row():
|
831 |
-
batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
|
832 |
-
total_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=8,interactive=True)
|
833 |
-
text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,interactive=True)
|
834 |
-
save_every_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True)
|
835 |
-
if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
|
836 |
-
if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
|
837 |
-
gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
|
838 |
-
with gr.Row():
|
839 |
-
button1Ba_open = gr.Button(i18n("开启SoVITS训练"), variant="primary",visible=True)
|
840 |
-
button1Ba_close = gr.Button(i18n("终止SoVITS训练"), variant="primary",visible=False)
|
841 |
-
info1Ba=gr.Textbox(label=i18n("SoVITS训练进程输出信息"))
|
842 |
-
gr.Markdown(value=i18n("1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。"))
|
843 |
-
with gr.Row():
|
844 |
-
batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
|
845 |
-
total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True)
|
846 |
-
if_dpo = gr.Checkbox(label=i18n("是否开启dpo训练选项(实验性)"), value=False, interactive=True, show_label=True)
|
847 |
-
if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
|
848 |
-
if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
|
849 |
-
save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True)
|
850 |
-
gpu_numbers1Bb = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
|
851 |
-
with gr.Row():
|
852 |
-
button1Bb_open = gr.Button(i18n("开启GPT训练"), variant="primary",visible=True)
|
853 |
-
button1Bb_close = gr.Button(i18n("终止GPT训练"), variant="primary",visible=False)
|
854 |
-
info1Bb=gr.Textbox(label=i18n("GPT训练进程输出信息"))
|
855 |
-
button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close])
|
856 |
-
button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close])
|
857 |
-
button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_dpo,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close])
|
858 |
-
button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close])
|
859 |
-
with gr.TabItem(i18n("1C-推理")):
|
860 |
-
gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。"))
|
861 |
-
with gr.Row():
|
862 |
-
GPT_dropdown = gr.Dropdown(label=i18n("*GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name,interactive=True)
|
863 |
-
SoVITS_dropdown = gr.Dropdown(label=i18n("*SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name,interactive=True)
|
864 |
-
gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True)
|
865 |
-
refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
|
866 |
-
refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown])
|
867 |
-
with gr.Row():
|
868 |
-
if_tts = gr.Checkbox(label=i18n("是否开启TTS推理WebUI"), show_label=True)
|
869 |
-
tts_info = gr.Textbox(label=i18n("TTS推理WebUI进程输出信息"))
|
870 |
-
if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info])
|
871 |
-
with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("
|
872 |
-
app.queue(concurrency_count=511, max_size=1022).launch(
|
873 |
-
server_name="0.0.0.0",
|
874 |
-
inbrowser=True,
|
875 |
-
share=is_share,
|
876 |
-
server_port=webui_port_main,
|
877 |
-
quiet=True,
|
878 |
-
)
|
|
|
1 |
+
import os,shutil,sys,pdb,re
|
2 |
+
now_dir = os.getcwd()
|
3 |
+
sys.path.insert(0, now_dir)
|
4 |
+
import json,yaml,warnings,torch
|
5 |
+
import platform
|
6 |
+
import psutil
|
7 |
+
import signal
|
8 |
+
|
9 |
+
warnings.filterwarnings("ignore")
|
10 |
+
torch.manual_seed(233333)
|
11 |
+
tmp = os.path.join(now_dir, "TEMP")
|
12 |
+
os.makedirs(tmp, exist_ok=True)
|
13 |
+
os.environ["TEMP"] = tmp
|
14 |
+
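# on startup, clear out everything in TEMP except jieba's cache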
if(os.path.exists(tmp)):
|
15 |
+
for name in os.listdir(tmp):
|
16 |
+
if(name=="jieba.cache"):continue
|
17 |
+
path="%s/%s"%(tmp,name)
|
18 |
+
delete=os.remove if os.path.isfile(path) else shutil.rmtree
|
19 |
+
try:
|
20 |
+
delete(path)
|
21 |
+
except Exception as e:
|
22 |
+
print(str(e))
|
23 |
+
pass
|
24 |
+
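# write a users.pth into site-packages so the repo's subfolders (tools, GPT_SoVITS, ...) land on sys.path for the spawned subprocesses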
import site
|
25 |
+
site_packages_roots = []
|
26 |
+
for path in site.getsitepackages():
|
27 |
+
if "packages" in path:
|
28 |
+
site_packages_roots.append(path)
|
29 |
+
if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir]
|
30 |
+
#os.environ["OPENBLAS_NUM_THREADS"] = "4"
|
31 |
+
os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
|
32 |
+
os.environ["all_proxy"] = ""
|
33 |
+
for site_packages_root in site_packages_roots:
|
34 |
+
if os.path.exists(site_packages_root):
|
35 |
+
try:
|
36 |
+
with open("%s/users.pth" % (site_packages_root), "w") as f:
|
37 |
+
f.write(
|
38 |
+
"%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
|
39 |
+
% (now_dir, now_dir, now_dir, now_dir, now_dir)
|
40 |
+
)
|
41 |
+
break
|
42 |
+
except PermissionError:
|
43 |
+
pass
|
44 |
+
from tools import my_utils
|
45 |
+
import traceback
|
46 |
+
import shutil
|
47 |
+
import pdb
|
48 |
+
import gradio as gr
|
49 |
+
from subprocess import Popen
|
50 |
+
import signal
|
51 |
+
from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share
|
52 |
+
from tools.i18n.i18n import I18nAuto
|
53 |
+
i18n = I18nAuto()
|
54 |
+
from scipy.io import wavfile
|
55 |
+
from tools.my_utils import load_audio
|
56 |
+
from multiprocessing import cpu_count
|
57 |
+
|
58 |
+
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # fall back to CPU for steps that MPS does not support
|
59 |
+
|
60 |
+
n_cpu=cpu_count()
|
61 |
+
|
62 |
+
ngpu = torch.cuda.device_count()
|
63 |
+
gpu_infos = []
|
64 |
+
mem = []
|
65 |
+
if_gpu_ok = False
|
66 |
+
|
67 |
+
# check whether an NVIDIA GPU is available for training and accelerated inference
|
68 |
+
if torch.cuda.is_available() or ngpu != 0:
|
69 |
+
for i in range(ngpu):
|
70 |
+
gpu_name = torch.cuda.get_device_name(i)
|
71 |
+
if any(value in gpu_name.upper() for value in ["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060"]):
|
72 |
+
# A10#A100#V100#A40#P40#M40#K80#A4500
|
73 |
+
if_gpu_ok = True # at least one usable NVIDIA GPU
|
74 |
+
gpu_infos.append("%s\t%s" % (i, gpu_name))
|
75 |
+
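# total VRAM per card in GiB; the +0.4 nudges near-integer sizes up before int() truncates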
mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
|
76 |
+
# # check whether MPS acceleration is supported
|
77 |
+
# if torch.backends.mps.is_available():
|
78 |
+
# if_gpu_ok = True
|
79 |
+
# gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
|
80 |
+
# mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # in testing, using system RAM as VRAM did not run out of memory
|
81 |
+
|
82 |
+
if if_gpu_ok and len(gpu_infos) > 0:
|
83 |
+
gpu_info = "\n".join(gpu_infos)
|
84 |
+
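# heuristic default: half the smallest card's VRAM in GiB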
default_batch_size = min(mem) // 2
|
85 |
+
else:
|
86 |
+
gpu_info = ("%s\t%s" % ("0", "CPU"))
|
87 |
+
gpu_infos.append("%s\t%s" % ("0", "CPU"))
|
88 |
+
default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2
|
89 |
+
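# i[0] is the first character of each "index\tname" entry, so this assumes single-digit GPU indices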
gpus = "-".join([i[0] for i in gpu_infos])
|
90 |
+
|
91 |
+
pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
|
92 |
+
pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
|
93 |
+
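# pretrained weights first, then any user-trained checkpoints found in the weight folders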
def get_weights_names():
|
94 |
+
SoVITS_names = [pretrained_sovits_name]
|
95 |
+
for name in os.listdir(SoVITS_weight_root):
|
96 |
+
if name.endswith(".pth"):SoVITS_names.append(name)
|
97 |
+
GPT_names = [pretrained_gpt_name]
|
98 |
+
for name in os.listdir(GPT_weight_root):
|
99 |
+
if name.endswith(".ckpt"): GPT_names.append(name)
|
100 |
+
return SoVITS_names,GPT_names
|
101 |
+
SoVITS_weight_root="SoVITS_weights"
|
102 |
+
GPT_weight_root="GPT_weights"
|
103 |
+
os.makedirs(SoVITS_weight_root,exist_ok=True)
|
104 |
+
os.makedirs(GPT_weight_root,exist_ok=True)
|
105 |
+
SoVITS_names,GPT_names = get_weights_names()
|
106 |
+
|
107 |
+
def custom_sort_key(s):
|
108 |
+
# use a regex to split the string into numeric and non-numeric parts
|
109 |
+
parts = re.split(r'(\d+)', s)
|
110 |
+
# convert the numeric parts to integers and leave the non-numeric parts unchanged
|
111 |
+
parts = [int(part) if part.isdigit() else part for part in parts]
|
112 |
+
return parts
|
113 |
+
|
114 |
+
def change_choices():
|
115 |
+
SoVITS_names, GPT_names = get_weights_names()
|
116 |
+
return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
|
117 |
+
|
118 |
+
p_label=None
|
119 |
+
p_uvr5=None
|
120 |
+
p_asr=None
|
121 |
+
p_denoise=None
|
122 |
+
p_tts_inference=None
|
123 |
+
|
124 |
+
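# terminate a process and all of its children via psutil (used on non-Windows platforms)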
def kill_proc_tree(pid, including_parent=True):
|
125 |
+
try:
|
126 |
+
parent = psutil.Process(pid)
|
127 |
+
except psutil.NoSuchProcess:
|
128 |
+
# Process already terminated
|
129 |
+
return
|
130 |
+
|
131 |
+
children = parent.children(recursive=True)
|
132 |
+
for child in children:
|
133 |
+
try:
|
134 |
+
os.kill(child.pid, signal.SIGTERM) # or signal.SIGKILL
|
135 |
+
except OSError:
|
136 |
+
pass
|
137 |
+
if including_parent:
|
138 |
+
try:
|
139 |
+
os.kill(parent.pid, signal.SIGTERM) # or signal.SIGKILL
|
140 |
+
except OSError:
|
141 |
+
pass
|
142 |
+
|
143 |
+
system=platform.system()
|
144 |
+
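# on Windows, taskkill /t /f kills the whole process tree; elsewhere fall back to kill_proc_tree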
def kill_process(pid):
|
145 |
+
if(system=="Windows"):
|
146 |
+
cmd = "taskkill /t /f /pid %s" % pid
|
147 |
+
os.system(cmd)
|
148 |
+
else:
|
149 |
+
kill_proc_tree(pid)
|
150 |
+
|
151 |
+
|
152 |
+
def change_label(if_label,path_list):
|
153 |
+
global p_label
|
154 |
+
if(if_label==True and p_label==None):
|
155 |
+
path_list=my_utils.clean_path(path_list)
|
156 |
+
cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share)
|
157 |
+
yield i18n("打标工具WebUI已开启")
|
158 |
+
print(cmd)
|
159 |
+
p_label = Popen(cmd, shell=True)
|
160 |
+
elif(if_label==False and p_label!=None):
|
161 |
+
kill_process(p_label.pid)
|
162 |
+
p_label=None
|
163 |
+
yield i18n("打标工具WebUI已关闭")
|
164 |
+
|
165 |
+
def change_uvr5(if_uvr5):
|
166 |
+
global p_uvr5
|
167 |
+
if(if_uvr5==True and p_uvr5==None):
|
168 |
+
cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share)
|
169 |
+
yield i18n("UVR5已开启")
|
170 |
+
print(cmd)
|
171 |
+
p_uvr5 = Popen(cmd, shell=True)
|
172 |
+
elif(if_uvr5==False and p_uvr5!=None):
|
173 |
+
kill_process(p_uvr5.pid)
|
174 |
+
p_uvr5=None
|
175 |
+
yield i18n("UVR5已关闭")
|
176 |
+
|
177 |
+
def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path):
|
178 |
+
global p_tts_inference
|
179 |
+
if(if_tts==True and p_tts_inference==None):
|
180 |
+
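# a bare filename is resolved against the weight folder; anything containing "/" is used as-is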
os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
|
181 |
+
os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
|
182 |
+
os.environ["cnhubert_base_path"]=cnhubert_base_path
|
183 |
+
os.environ["bert_path"]=bert_path
|
184 |
+
os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number
|
185 |
+
os.environ["is_half"]=str(is_half)
|
186 |
+
os.environ["infer_ttswebui"]=str(webui_port_infer_tts)
|
187 |
+
os.environ["is_share"]=str(is_share)
|
188 |
+
cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec)
|
189 |
+
yield i18n("TTS推理进程已开启")
|
190 |
+
print(cmd)
|
191 |
+
p_tts_inference = Popen(cmd, shell=True)
|
192 |
+
elif(if_tts==False and p_tts_inference!=None):
|
193 |
+
kill_process(p_tts_inference.pid)
|
194 |
+
p_tts_inference=None
|
195 |
+
yield i18n("TTS推理进程已关闭")
|
196 |
+
|
197 |
+
from tools.asr.config import asr_dict
|
198 |
+
def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
|
199 |
+
global p_asr
|
200 |
+
if(p_asr==None):
|
201 |
+
asr_inp_dir=my_utils.clean_path(asr_inp_dir)
|
202 |
+
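# assemble the ASR command line from the model table in tools/asr/config.py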
cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}'
|
203 |
+
cmd += f' -i "{asr_inp_dir}"'
|
204 |
+
cmd += f' -o "{asr_opt_dir}"'
|
205 |
+
cmd += f' -s {asr_model_size}'
|
206 |
+
cmd += f' -l {asr_lang}'
|
207 |
+
cmd += " -p %s"%("float16"if is_half==True else "float32")
|
208 |
+
|
209 |
+
yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
210 |
+
print(cmd)
|
211 |
+
p_asr = Popen(cmd, shell=True)
|
212 |
+
p_asr.wait()
|
213 |
+
p_asr=None
|
214 |
+
yield f"ASR任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
215 |
+
else:
|
216 |
+
yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
217 |
+
# return None
|
218 |
+
|
219 |
+
def close_asr():
|
220 |
+
global p_asr
|
221 |
+
if(p_asr!=None):
|
222 |
+
kill_process(p_asr.pid)
|
223 |
+
p_asr=None
|
224 |
+
return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
225 |
+
def open_denoise(denoise_inp_dir, denoise_opt_dir):
|
226 |
+
global p_denoise
|
227 |
+
if(p_denoise==None):
|
228 |
+
denoise_inp_dir=my_utils.clean_path(denoise_inp_dir)
|
229 |
+
denoise_opt_dir=my_utils.clean_path(denoise_opt_dir)
|
230 |
+
cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32")
|
231 |
+
|
232 |
+
yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
233 |
+
print(cmd)
|
234 |
+
p_denoise = Popen(cmd, shell=True)
|
235 |
+
p_denoise.wait()
|
236 |
+
p_denoise=None
|
237 |
+
yield f"语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
238 |
+
else:
|
239 |
+
yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
240 |
+
# return None
|
241 |
+
|
242 |
+
def close_denoise():
|
243 |
+
global p_denoise
|
244 |
+
if(p_denoise!=None):
|
245 |
+
kill_process(p_denoise.pid)
|
246 |
+
p_denoise=None
|
247 |
+
return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
248 |
+
|
249 |
+
p_train_SoVITS=None
|
250 |
+
def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
|
251 |
+
global p_train_SoVITS
|
252 |
+
if(p_train_SoVITS==None):
|
253 |
+
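# patch the template s2 config with the UI settings, then train from a temp copy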
with open("GPT_SoVITS/configs/s2.json")as f:
|
254 |
+
data=f.read()
|
255 |
+
data=json.loads(data)
|
256 |
+
s2_dir="%s/%s"%(exp_root,exp_name)
|
257 |
+
os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True)
|
258 |
+
if(is_half==False):
|
259 |
+
data["train"]["fp16_run"]=False
|
260 |
+
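# full precision needs roughly twice the memory of fp16, so halve the batch size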
batch_size=max(1,batch_size//2)
|
261 |
+
data["train"]["batch_size"]=batch_size
|
262 |
+
data["train"]["epochs"]=total_epoch
|
263 |
+
data["train"]["text_low_lr_rate"]=text_low_lr_rate
|
264 |
+
data["train"]["pretrained_s2G"]=pretrained_s2G
|
265 |
+
data["train"]["pretrained_s2D"]=pretrained_s2D
|
266 |
+
data["train"]["if_save_latest"]=if_save_latest
|
267 |
+
data["train"]["if_save_every_weights"]=if_save_every_weights
|
268 |
+
data["train"]["save_every_epoch"]=save_every_epoch
|
269 |
+
data["train"]["gpu_numbers"]=gpu_numbers1Ba
|
270 |
+
data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir
|
271 |
+
data["save_weight_dir"]=SoVITS_weight_root
|
272 |
+
data["name"]=exp_name
|
273 |
+
tmp_config_path="%s/tmp_s2.json"%tmp
|
274 |
+
with open(tmp_config_path,"w")as f:f.write(json.dumps(data))
|
275 |
+
|
276 |
+
cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path)
|
277 |
+
yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
278 |
+
print(cmd)
|
279 |
+
p_train_SoVITS = Popen(cmd, shell=True)
|
280 |
+
p_train_SoVITS.wait()
|
281 |
+
p_train_SoVITS=None
|
282 |
+
yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
283 |
+
else:
|
284 |
+
yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
285 |
+
|
286 |
+
def close1Ba():
|
287 |
+
global p_train_SoVITS
|
288 |
+
if(p_train_SoVITS!=None):
|
289 |
+
kill_process(p_train_SoVITS.pid)
|
290 |
+
p_train_SoVITS=None
|
291 |
+
return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
292 |
+
|
293 |
+
p_train_GPT=None
|
294 |
+
def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1):
|
295 |
+
global p_train_GPT
|
296 |
+
if(p_train_GPT==None):
|
297 |
+
with open("GPT_SoVITS/configs/s1longer.yaml")as f:
|
298 |
+
data=f.read()
|
299 |
+
data=yaml.load(data, Loader=yaml.FullLoader)
|
300 |
+
s1_dir="%s/%s"%(exp_root,exp_name)
|
301 |
+
os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True)
|
302 |
+
if(is_half==False):
|
303 |
+
data["train"]["precision"]="32"
|
304 |
+
batch_size = max(1, batch_size // 2)
|
305 |
+
data["train"]["batch_size"]=batch_size
|
306 |
+
data["train"]["epochs"]=total_epoch
|
307 |
+
data["pretrained_s1"]=pretrained_s1
|
308 |
+
data["train"]["save_every_n_epoch"]=save_every_epoch
|
309 |
+
data["train"]["if_save_every_weights"]=if_save_every_weights
|
310 |
+
data["train"]["if_save_latest"]=if_save_latest
|
311 |
+
data["train"]["if_dpo"]=if_dpo
|
312 |
+
data["train"]["half_weights_save_dir"]=GPT_weight_root
|
313 |
+
data["train"]["exp_name"]=exp_name
|
314 |
+
data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir
|
315 |
+
data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir
|
316 |
+
data["output_dir"]="%s/logs_s1"%s1_dir
|
317 |
+
|
318 |
+
os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",")
|
319 |
+
os.environ["hz"]="25hz"
|
320 |
+
tmp_config_path="%s/tmp_s1.yaml"%tmp
|
321 |
+
with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False))
|
322 |
+
# cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir)
|
323 |
+
cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path)
|
324 |
+
yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
325 |
+
print(cmd)
|
326 |
+
p_train_GPT = Popen(cmd, shell=True)
|
327 |
+
p_train_GPT.wait()
|
328 |
+
p_train_GPT=None
|
329 |
+
yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
330 |
+
else:
|
331 |
+
yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
332 |
+
|
333 |
+
def close1Bb():
|
334 |
+
global p_train_GPT
|
335 |
+
if(p_train_GPT!=None):
|
336 |
+
kill_process(p_train_GPT.pid)
|
337 |
+
p_train_GPT=None
|
338 |
+
return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
339 |
+
|
340 |
+
ps_slice=[]
|
341 |
+
def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts):
|
342 |
+
global ps_slice
|
343 |
+
inp = my_utils.clean_path(inp)
|
344 |
+
opt_root = my_utils.clean_path(opt_root)
|
345 |
+
if(os.path.exists(inp)==False):
|
346 |
+
yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
347 |
+
return
|
348 |
+
if os.path.isfile(inp):n_parts=1
|
349 |
+
elif os.path.isdir(inp):pass
|
350 |
+
else:
|
351 |
+
yield "输入路径存在但既不是文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
352 |
+
return
|
353 |
+
if (ps_slice == []):
|
354 |
+
for i_part in range(n_parts):
|
355 |
+
cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts)
|
356 |
+
print(cmd)
|
357 |
+
p = Popen(cmd, shell=True)
|
358 |
+
ps_slice.append(p)
|
359 |
+
yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
360 |
+
for p in ps_slice:
|
361 |
+
p.wait()
|
362 |
+
ps_slice=[]
|
363 |
+
yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
364 |
+
else:
|
365 |
+
yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
366 |
+
|
367 |
+
def close_slice():
|
368 |
+
global ps_slice
|
369 |
+
if (ps_slice != []):
|
370 |
+
for p_slice in ps_slice:
|
371 |
+
try:
|
372 |
+
kill_process(p_slice.pid)
|
373 |
+
except:
|
374 |
+
traceback.print_exc()
|
375 |
+
ps_slice=[]
|
376 |
+
return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
377 |
+
|
378 |
+
ps1a=[]
|
379 |
+
def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir):
|
380 |
+
global ps1a
|
381 |
+
inp_text = my_utils.clean_path(inp_text)
|
382 |
+
inp_wav_dir = my_utils.clean_path(inp_wav_dir)
|
383 |
+
if (ps1a == []):
|
384 |
+
opt_dir="%s/%s"%(exp_root,exp_name)
|
385 |
+
config={
|
386 |
+
"inp_text":inp_text,
|
387 |
+
"inp_wav_dir":inp_wav_dir,
|
388 |
+
"exp_name":exp_name,
|
389 |
+
"opt_dir":opt_dir,
|
390 |
+
"bert_pretrained_dir":bert_pretrained_dir,
|
391 |
+
}
|
392 |
+
gpu_names=gpu_numbers.split("-")
|
393 |
+
all_parts=len(gpu_names)
|
394 |
+
for i_part in range(all_parts):
|
395 |
+
config.update(
|
396 |
+
{
|
397 |
+
"i_part": str(i_part),
|
398 |
+
"all_parts": str(all_parts),
|
399 |
+
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
400 |
+
"is_half": str(is_half)
|
401 |
+
}
|
402 |
+
)
|
403 |
+
os.environ.update(config)
|
404 |
+
cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
|
405 |
+
print(cmd)
|
406 |
+
p = Popen(cmd, shell=True)
|
407 |
+
ps1a.append(p)
|
408 |
+
yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
409 |
+
for p in ps1a:
|
410 |
+
p.wait()
|
411 |
+
opt = []
|
412 |
+
for i_part in range(all_parts):
|
413 |
+
txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
|
414 |
+
with open(txt_path, "r", encoding="utf8") as f:
|
415 |
+
opt += f.read().strip("\n").split("\n")
|
416 |
+
os.remove(txt_path)
|
417 |
+
path_text = "%s/2-name2text.txt" % opt_dir
|
418 |
+
with open(path_text, "w", encoding="utf8") as f:
|
419 |
+
f.write("\n".join(opt) + "\n")
|
420 |
+
ps1a=[]
|
421 |
+
yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
422 |
+
else:
|
423 |
+
yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
424 |
+
|
425 |
+
def close1a():
|
426 |
+
global ps1a
|
427 |
+
if (ps1a != []):
|
428 |
+
for p1a in ps1a:
|
429 |
+
try:
|
430 |
+
kill_process(p1a.pid)
|
431 |
+
except:
|
432 |
+
traceback.print_exc()
|
433 |
+
ps1a=[]
|
434 |
+
return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
435 |
+
|
436 |
+
ps1b=[]
|
437 |
+
def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir):
|
438 |
+
global ps1b
|
439 |
+
inp_text = my_utils.clean_path(inp_text)
|
440 |
+
inp_wav_dir = my_utils.clean_path(inp_wav_dir)
|
441 |
+
if (ps1b == []):
|
442 |
+
config={
|
443 |
+
"inp_text":inp_text,
|
444 |
+
"inp_wav_dir":inp_wav_dir,
|
445 |
+
"exp_name":exp_name,
|
446 |
+
"opt_dir":"%s/%s"%(exp_root,exp_name),
|
447 |
+
"cnhubert_base_dir":ssl_pretrained_dir,
|
448 |
+
"is_half": str(is_half)
|
449 |
+
}
|
450 |
+
gpu_names=gpu_numbers.split("-")
|
451 |
+
all_parts=len(gpu_names)
|
452 |
+
for i_part in range(all_parts):
|
453 |
+
config.update(
|
454 |
+
{
|
455 |
+
"i_part": str(i_part),
|
456 |
+
"all_parts": str(all_parts),
|
457 |
+
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
458 |
+
}
|
459 |
+
)
|
460 |
+
os.environ.update(config)
|
461 |
+
cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
|
462 |
+
print(cmd)
|
463 |
+
p = Popen(cmd, shell=True)
|
464 |
+
ps1b.append(p)
|
465 |
+
yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
466 |
+
for p in ps1b:
|
467 |
+
p.wait()
|
468 |
+
ps1b=[]
|
469 |
+
yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
470 |
+
else:
|
471 |
+
yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
472 |
+
|
473 |
+
def close1b():
|
474 |
+
global ps1b
|
475 |
+
if (ps1b != []):
|
476 |
+
for p1b in ps1b:
|
477 |
+
try:
|
478 |
+
kill_process(p1b.pid)
|
479 |
+
except:
|
480 |
+
traceback.print_exc()
|
481 |
+
ps1b=[]
|
482 |
+
return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
483 |
+
|
484 |
+
ps1c=[]
|
485 |
+
def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path):
|
486 |
+
global ps1c
|
487 |
+
inp_text = my_utils.clean_path(inp_text)
|
488 |
+
if (ps1c == []):
|
489 |
+
opt_dir="%s/%s"%(exp_root,exp_name)
|
490 |
+
config={
|
491 |
+
"inp_text":inp_text,
|
492 |
+
"exp_name":exp_name,
|
493 |
+
"opt_dir":opt_dir,
|
494 |
+
"pretrained_s2G":pretrained_s2G_path,
|
495 |
+
"s2config_path":"GPT_SoVITS/configs/s2.json",
|
496 |
+
"is_half": str(is_half)
|
497 |
+
}
|
498 |
+
gpu_names=gpu_numbers.split("-")
|
499 |
+
all_parts=len(gpu_names)
|
500 |
+
for i_part in range(all_parts):
|
501 |
+
config.update(
|
502 |
+
{
|
503 |
+
"i_part": str(i_part),
|
504 |
+
"all_parts": str(all_parts),
|
505 |
+
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
506 |
+
}
|
507 |
+
)
|
508 |
+
os.environ.update(config)
|
509 |
+
cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
|
510 |
+
print(cmd)
|
511 |
+
p = Popen(cmd, shell=True)
|
512 |
+
ps1c.append(p)
|
513 |
+
yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
514 |
+
for p in ps1c:
|
515 |
+
p.wait()
|
516 |
+
opt = ["item_name\tsemantic_audio"]
|
517 |
+
path_semantic = "%s/6-name2semantic.tsv" % opt_dir
|
518 |
+
for i_part in range(all_parts):
|
519 |
+
semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
|
520 |
+
with open(semantic_path, "r", encoding="utf8") as f:
|
521 |
+
opt += f.read().strip("\n").split("\n")
|
522 |
+
os.remove(semantic_path)
|
523 |
+
with open(path_semantic, "w", encoding="utf8") as f:
|
524 |
+
f.write("\n".join(opt) + "\n")
|
525 |
+
ps1c=[]
|
526 |
+
yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
527 |
+
else:
|
528 |
+
yield "已有正在��行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
529 |
+
|
530 |
+
def close1c():
|
531 |
+
global ps1c
|
532 |
+
if (ps1c != []):
|
533 |
+
for p1c in ps1c:
|
534 |
+
try:
|
535 |
+
kill_process(p1c.pid)
|
536 |
+
except:
|
537 |
+
traceback.print_exc()
|
538 |
+
ps1c=[]
|
539 |
+
return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
540 |
+
#####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G
|
541 |
+
ps1abc=[]
|
542 |
+
def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path):
|
543 |
+
global ps1abc
|
544 |
+
inp_text = my_utils.clean_path(inp_text)
|
545 |
+
inp_wav_dir = my_utils.clean_path(inp_wav_dir)
|
546 |
+
if (ps1abc == []):
|
547 |
+
opt_dir="%s/%s"%(exp_root,exp_name)
|
548 |
+
try:
|
549 |
+
#############################1a
|
550 |
+
path_text="%s/2-name2text.txt" % opt_dir
|
551 |
+
if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and len(open(path_text,"r",encoding="utf8").read().strip("\n").split("\n"))<2)):
|
552 |
+
config={
|
553 |
+
"inp_text":inp_text,
|
554 |
+
"inp_wav_dir":inp_wav_dir,
|
555 |
+
"exp_name":exp_name,
|
556 |
+
"opt_dir":opt_dir,
|
557 |
+
"bert_pretrained_dir":bert_pretrained_dir,
|
558 |
+
"is_half": str(is_half)
|
559 |
+
}
|
560 |
+
gpu_names=gpu_numbers1a.split("-")
|
561 |
+
all_parts=len(gpu_names)
|
562 |
+
for i_part in range(all_parts):
|
563 |
+
config.update(
|
564 |
+
{
|
565 |
+
"i_part": str(i_part),
|
566 |
+
"all_parts": str(all_parts),
|
567 |
+
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
568 |
+
}
|
569 |
+
)
|
570 |
+
os.environ.update(config)
|
571 |
+
cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
|
572 |
+
print(cmd)
|
573 |
+
p = Popen(cmd, shell=True)
|
574 |
+
ps1abc.append(p)
|
575 |
+
yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
576 |
+
for p in ps1abc:p.wait()
|
577 |
+
|
578 |
+
opt = []
|
579 |
+
for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part)
|
580 |
+
txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
|
581 |
+
with open(txt_path, "r",encoding="utf8") as f:
|
582 |
+
opt += f.read().strip("\n").split("\n")
|
583 |
+
os.remove(txt_path)
|
584 |
+
with open(path_text, "w",encoding="utf8") as f:
|
585 |
+
f.write("\n".join(opt) + "\n")
|
586 |
+
|
587 |
+
yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
588 |
+
ps1abc=[]
|
589 |
+
#############################1b
|
590 |
+
config={
|
591 |
+
"inp_text":inp_text,
|
592 |
+
"inp_wav_dir":inp_wav_dir,
|
593 |
+
"exp_name":exp_name,
|
594 |
+
"opt_dir":opt_dir,
|
595 |
+
"cnhubert_base_dir":ssl_pretrained_dir,
|
596 |
+
}
|
597 |
+
gpu_names=gpu_numbers1Ba.split("-")
|
598 |
+
all_parts=len(gpu_names)
|
599 |
+
for i_part in range(all_parts):
|
600 |
+
config.update(
|
601 |
+
{
|
602 |
+
"i_part": str(i_part),
|
603 |
+
"all_parts": str(all_parts),
|
604 |
+
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
605 |
+
}
|
606 |
+
)
|
607 |
+
os.environ.update(config)
|
608 |
+
cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
|
609 |
+
print(cmd)
|
610 |
+
p = Popen(cmd, shell=True)
|
611 |
+
ps1abc.append(p)
|
612 |
+
yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
613 |
+
for p in ps1abc:p.wait()
|
614 |
+
yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
615 |
+
ps1abc=[]
|
616 |
+
#############################1c
|
617 |
+
path_semantic = "%s/6-name2semantic.tsv" % opt_dir
|
618 |
+
if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)):
|
619 |
+
config={
|
620 |
+
"inp_text":inp_text,
|
621 |
+
"exp_name":exp_name,
|
622 |
+
"opt_dir":opt_dir,
|
623 |
+
"pretrained_s2G":pretrained_s2G_path,
|
624 |
+
"s2config_path":"GPT_SoVITS/configs/s2.json",
|
625 |
+
}
|
626 |
+
gpu_names=gpu_numbers1c.split("-")
|
627 |
+
all_parts=len(gpu_names)
|
628 |
+
for i_part in range(all_parts):
|
629 |
+
config.update(
|
630 |
+
{
|
631 |
+
"i_part": str(i_part),
|
632 |
+
"all_parts": str(all_parts),
|
633 |
+
"_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
|
634 |
+
}
|
635 |
+
)
|
636 |
+
os.environ.update(config)
|
637 |
+
cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
|
638 |
+
print(cmd)
|
639 |
+
p = Popen(cmd, shell=True)
|
640 |
+
ps1abc.append(p)
|
641 |
+
yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
642 |
+
for p in ps1abc:p.wait()
|
643 |
+
|
644 |
+
opt = ["item_name\tsemantic_audio"]
|
645 |
+
for i_part in range(all_parts):
|
646 |
+
semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
|
647 |
+
with open(semantic_path, "r",encoding="utf8") as f:
|
648 |
+
opt += f.read().strip("\n").split("\n")
|
649 |
+
os.remove(semantic_path)
|
650 |
+
with open(path_semantic, "w",encoding="utf8") as f:
|
651 |
+
f.write("\n".join(opt) + "\n")
|
652 |
+
yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
653 |
+
ps1abc = []
|
654 |
+
yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
655 |
+
except:
|
656 |
+
traceback.print_exc()
|
657 |
+
close1abc()
|
658 |
+
yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
659 |
+
else:
|
660 |
+
yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
|
661 |
+
|
662 |
+
def close1abc():
|
663 |
+
global ps1abc
|
664 |
+
if (ps1abc != []):
|
665 |
+
for p1abc in ps1abc:
|
666 |
+
try:
|
667 |
+
kill_process(p1abc.pid)
|
668 |
+
except:
|
669 |
+
traceback.print_exc()
|
670 |
+
ps1abc=[]
|
671 |
+
return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
|
672 |
+
|
673 |
+
with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
674 |
+
gr.Markdown(
|
675 |
+
value=
|
676 |
+
i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
|
677 |
+
)
|
678 |
+
gr.Markdown(
|
679 |
+
value=
|
680 |
+
i18n("中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e")
|
681 |
+
)
|
682 |
+
|
683 |
+
with gr.Tabs():
|
684 |
+
with gr.TabItem(i18n("0-前置数据集获取工具")):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
|
685 |
+
gr.Markdown(value=i18n("0a-UVR5人声伴奏分离&去混响去延迟工具"))
|
686 |
+
with gr.Row():
|
687 |
+
if_uvr5 = gr.Checkbox(label=i18n("是否开启UVR5-WebUI"),show_label=True)
|
688 |
+
uvr5_info = gr.Textbox(label=i18n("UVR5进程输出信息"))
|
689 |
+
gr.Markdown(value=i18n("0b-语音切分工具"))
|
690 |
+
with gr.Row():
|
691 |
+
with gr.Row():
|
692 |
+
slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),value="")
|
693 |
+
slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt")
|
694 |
+
threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34")
|
695 |
+
min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000")
|
696 |
+
min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300")
|
697 |
+
hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10")
|
698 |
+
max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500")
|
699 |
+
with gr.Row():
|
700 |
+
open_slicer_button=gr.Button(i18n("开启语音切割"), variant="primary",visible=True)
|
701 |
+
close_slicer_button=gr.Button(i18n("终止语音切割"), variant="primary",visible=False)
|
702 |
+
_max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True)
|
703 |
+
alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
|
704 |
+
n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
|
705 |
+
slicer_info = gr.Textbox(label=i18n("语音切割进���输出信息"))
|
706 |
+
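The slicing textboxes above are passed as strings to `open_slice`, which hands them to `tools/slice_audio.py` (also updated in this commit); ultimately they parameterize the `Slicer` in `tools/slicer2.py`. A sketch of how the UI defaults could map onto that class, assuming the parameter names match the labels (the real call site lives in the tools scripts):

```python
import librosa
from tools.slicer2 import Slicer

# UI defaults: -34 dB silence threshold, segments >= 4000 ms, >= 300 ms
# silence gap, 10 ms hop for the volume curve, keep <= 500 ms of silence.
audio, sr = librosa.load("input.wav", sr=None)
slicer = Slicer(
    sr=sr,
    threshold=-34,
    min_length=4000,
    min_interval=300,
    hop_size=10,
    max_sil_kept=500,
)
chunks = slicer.slice(audio)  # see tools/slice_audio.py for the real loop
```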
gr.Markdown(value=i18n("0bb-语音降噪工具"))
|
707 |
+
with gr.Row():
|
708 |
+
open_denoise_button = gr.Button(i18n("开启语音降噪"), variant="primary",visible=True)
|
709 |
+
close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False)
|
710 |
+
denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="")
|
711 |
+
denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt")
|
712 |
+
denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息"))
|
713 |
+
gr.Markdown(value=i18n("0c-中文批量离线ASR工具"))
|
714 |
+
with gr.Row():
|
715 |
+
open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True)
|
716 |
+
close_asr_button = gr.Button(i18n("终止ASR进程"), variant="primary",visible=False)
|
717 |
+
with gr.Column():
|
718 |
+
with gr.Row():
|
719 |
+
asr_inp_dir = gr.Textbox(
|
720 |
+
label=i18n("输入文件夹路径"),
|
721 |
+
value="D:\\GPT-SoVITS\\raw\\xxx",
|
722 |
+
interactive=True,
|
723 |
+
)
|
724 |
+
asr_opt_dir = gr.Textbox(
|
725 |
+
label = i18n("输出文件夹路径"),
|
726 |
+
value = "output/asr_opt",
|
727 |
+
interactive = True,
|
728 |
+
)
|
729 |
+
with gr.Row():
|
730 |
+
asr_model = gr.Dropdown(
|
731 |
+
label = i18n("ASR 模型"),
|
732 |
+
choices = list(asr_dict.keys()),
|
733 |
+
interactive = True,
|
734 |
+
value="达摩 ASR (中文)"
|
735 |
+
)
|
736 |
+
asr_size = gr.Dropdown(
|
737 |
+
label = i18n("ASR 模型尺寸"),
|
738 |
+
choices = ["large"],
|
739 |
+
interactive = True,
|
740 |
+
value="large"
|
741 |
+
)
|
742 |
+
asr_lang = gr.Dropdown(
|
743 |
+
label = i18n("ASR 语言设置"),
|
744 |
+
choices = ["zh"],
|
745 |
+
interactive = True,
|
746 |
+
value="zh"
|
747 |
+
)
|
748 |
+
with gr.Row():
|
749 |
+
asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))
|
750 |
+
|
751 |
+
+            def change_lang_choices(key):  # update the selectable languages to match the chosen model
+                # return gr.Dropdown(choices=asr_dict[key]['lang'])
+                return {"__type__": "update", "choices": asr_dict[key]['lang'], "value": asr_dict[key]['lang'][0]}
+
+            def change_size_choices(key):  # update the selectable model sizes to match the chosen model
+                # return gr.Dropdown(choices=asr_dict[key]['size'])
+                return {"__type__": "update", "choices": asr_dict[key]['size']}
+
+            asr_model.change(change_lang_choices, [asr_model], [asr_lang])
+            asr_model.change(change_size_choices, [asr_model], [asr_size])
+
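The raw `{"__type__": "update", ...}` dicts returned above are Gradio's low-level component-update form: rather than a new value, the handler returns a patch for the target component's properties (here, the dropdown's `choices` and `value`). In Gradio 3.x the same handler can be written with `gr.update`, which builds an equivalent update dict (a sketch, not part of this commit):

```python
import gradio as gr

# Equivalent to the dict returned by change_lang_choices above.
def change_lang_choices(key):
    langs = asr_dict[key]["lang"]
    return gr.update(choices=langs, value=langs[0])
```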
+            gr.Markdown(value=i18n("0d-语音文本校对标注工具"))
+            with gr.Row():
+                if_label = gr.Checkbox(label=i18n("是否开启打标WebUI"), show_label=True)
+                path_list = gr.Textbox(
+                    label=i18n(".list标注文件的路径"),
+                    value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list",
+                    interactive=True,
+                )
+                label_info = gr.Textbox(label=i18n("打标工具进程输出信息"))
+            if_label.change(change_label, [if_label, path_list], [label_info])
+            if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info])
+            open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang], [asr_info, open_asr_button, close_asr_button])
+            close_asr_button.click(close_asr, [], [asr_info, open_asr_button, close_asr_button])
+            open_slicer_button.click(open_slice, [slice_inp_path, slice_opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, n_process], [slicer_info, open_slicer_button, close_slicer_button])
+            close_slicer_button.click(close_slice, [], [slicer_info, open_slicer_button, close_slicer_button])
+            open_denoise_button.click(open_denoise, [denoise_input_dir, denoise_output_dir], [denoise_info, open_denoise_button, close_denoise_button])
+            close_denoise_button.click(close_denoise, [], [denoise_info, open_denoise_button, close_denoise_button])
+
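Every `open_*`/`close_*` pair wired above (and the `open1*` handlers further down) follows one protocol: the open handler is a generator yielding `(status_text, open_button_update, close_button_update)`, so a click hides the "open" button, reveals the "terminate" button, and streams progress into the info textbox until the subprocess exits. A minimal sketch of the pattern with illustrative names (`open_demo` and `some_tool.py` are not from this commit):

```python
from subprocess import Popen

p_demo = None

def open_demo(inp_dir):
    # Generator in the style of open_asr / open_slice / open_denoise.
    global p_demo
    if p_demo is None:
        p_demo = Popen('python some_tool.py "%s"' % inp_dir, shell=True)
        # Hide the open button, show the close button while the job runs.
        yield "running", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        p_demo.wait()
        p_demo = None
        yield "done", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
    else:
        yield "a job is already running", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}

def close_demo():
    global p_demo
    if p_demo is not None:
        p_demo.kill()
        p_demo = None
    return "terminated", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
```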
+        with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
+            with gr.Row():
+                exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
+                gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
+                pretrained_s2G = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value="GPT_SoVITS/pretrained_models/s2G488k.pth", interactive=True)
+                pretrained_s2D = gr.Textbox(label=i18n("预训练的SoVITS-D模型路径"), value="GPT_SoVITS/pretrained_models/s2D488k.pth", interactive=True)
+                pretrained_s1 = gr.Textbox(label=i18n("预训练的GPT模型路径"), value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", interactive=True)
+            with gr.TabItem(i18n("1A-训练集格式化工具")):
+                gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹"))
+                with gr.Row():
+                    inp_text = gr.Textbox(label=i18n("*文本标注文件"), value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list", interactive=True)
+                    inp_wav_dir = gr.Textbox(
+                        label=i18n("*训练集音频文件目录"),
+                        # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",
+                        interactive=True,
+                        placeholder=i18n("填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。"),
+                    )
+                gr.Markdown(value=i18n("1Aa-文本内容"))
+                with gr.Row():
+                    gpu_numbers1a = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s-%s" % (gpus, gpus), interactive=True)
+                    bert_pretrained_dir = gr.Textbox(label=i18n("预训练的中文BERT模型路径"), value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", interactive=False)
+                    button1a_open = gr.Button(i18n("开启文本获取"), variant="primary", visible=True)
+                    button1a_close = gr.Button(i18n("终止文本获取进程"), variant="primary", visible=False)
+                    info1a = gr.Textbox(label=i18n("文本进程输出信息"))
+                gr.Markdown(value=i18n("1Ab-SSL自监督特征提取"))
+                with gr.Row():
+                    gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s-%s" % (gpus, gpus), interactive=True)
+                    cnhubert_base_dir = gr.Textbox(label=i18n("预训练的SSL模型路径"), value="GPT_SoVITS/pretrained_models/chinese-hubert-base", interactive=False)
+                    button1b_open = gr.Button(i18n("开启SSL提取"), variant="primary", visible=True)
+                    button1b_close = gr.Button(i18n("终止SSL提取进程"), variant="primary", visible=False)
+                    info1b = gr.Textbox(label=i18n("SSL进程输出信息"))
+                gr.Markdown(value=i18n("1Ac-语义token提取"))
+                with gr.Row():
+                    gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s-%s" % (gpus, gpus), interactive=True)
+                    button1c_open = gr.Button(i18n("开启语义token提取"), variant="primary", visible=True)
+                    button1c_close = gr.Button(i18n("终止语义token提取进程"), variant="primary", visible=False)
+                    info1c = gr.Textbox(label=i18n("语义token提取进程输出信息"))
+                gr.Markdown(value=i18n("1Aabc-训练集格式化一键三连"))
+                with gr.Row():
+                    button1abc_open = gr.Button(i18n("开启一键三连"), variant="primary", visible=True)
+                    button1abc_close = gr.Button(i18n("终止一键三连"), variant="primary", visible=False)
+                    info1abc = gr.Textbox(label=i18n("一键三连进程输出信息"))
+                button1a_open.click(open1a, [inp_text, inp_wav_dir, exp_name, gpu_numbers1a, bert_pretrained_dir], [info1a, button1a_open, button1a_close])
+                button1a_close.click(close1a, [], [info1a, button1a_open, button1a_close])
+                button1b_open.click(open1b, [inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir], [info1b, button1b_open, button1b_close])
+                button1b_close.click(close1b, [], [info1b, button1b_open, button1b_close])
+                button1c_open.click(open1c, [inp_text, exp_name, gpu_numbers1c, pretrained_s2G], [info1c, button1c_open, button1c_close])
+                button1c_close.click(close1c, [], [info1c, button1c_open, button1c_close])
+                button1abc_open.click(open1abc, [inp_text, inp_wav_dir, exp_name, gpu_numbers1a, gpu_numbers1Ba, gpu_numbers1c, bert_pretrained_dir, cnhubert_base_dir, pretrained_s2G], [info1abc, button1abc_open, button1abc_close])
+                button1abc_close.click(close1abc, [], [info1abc, button1abc_open, button1abc_close])
with gr.TabItem(i18n("1B-微调训练")):
|
829 |
+
gr.Markdown(value=i18n("1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。"))
|
830 |
+
with gr.Row():
|
831 |
+
batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
|
832 |
+
total_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=8,interactive=True)
|
833 |
+
text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,interactive=True)
|
834 |
+
save_every_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True)
|
835 |
+
if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
|
836 |
+
if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
|
837 |
+
gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
|
838 |
+
with gr.Row():
|
839 |
+
button1Ba_open = gr.Button(i18n("开启SoVITS训练"), variant="primary",visible=True)
|
840 |
+
button1Ba_close = gr.Button(i18n("终止SoVITS训练"), variant="primary",visible=False)
|
841 |
+
info1Ba=gr.Textbox(label=i18n("SoVITS训练进程输出信息"))
|
842 |
+
gr.Markdown(value=i18n("1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。"))
|
843 |
+
with gr.Row():
|
844 |
+
batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
|
845 |
+
total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True)
|
846 |
+
if_dpo = gr.Checkbox(label=i18n("是否开启dpo训练选项(实验性)"), value=False, interactive=True, show_label=True)
|
847 |
+
if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
|
848 |
+
if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
|
849 |
+
save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True)
|
850 |
+
gpu_numbers1Bb = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
|
851 |
+
with gr.Row():
|
852 |
+
button1Bb_open = gr.Button(i18n("开启GPT训练"), variant="primary",visible=True)
|
853 |
+
button1Bb_close = gr.Button(i18n("终止GPT训练"), variant="primary",visible=False)
|
854 |
+
info1Bb=gr.Textbox(label=i18n("GPT训练进程输出信息"))
|
855 |
+
button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close])
|
856 |
+
button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close])
|
857 |
+
button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_dpo,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close])
|
858 |
+
button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close])
|
859 |
+
with gr.TabItem(i18n("1C-推理")):
|
860 |
+
gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。"))
|
861 |
+
with gr.Row():
|
862 |
+
GPT_dropdown = gr.Dropdown(label=i18n("*GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name,interactive=True)
|
863 |
+
SoVITS_dropdown = gr.Dropdown(label=i18n("*SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name,interactive=True)
|
864 |
+
gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True)
|
865 |
+
refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
|
866 |
+
refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown])
|
867 |
+
with gr.Row():
|
868 |
+
if_tts = gr.Checkbox(label=i18n("是否开启TTS推理WebUI"), show_label=True)
|
869 |
+
tts_info = gr.Textbox(label=i18n("TTS推理WebUI进程输出信息"))
|
870 |
+
if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info])
|
871 |
+
with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("���工中,请静候佳音"))
|
872 |
+
app.queue(concurrency_count=511, max_size=1022).launch(
|
873 |
+
server_name="0.0.0.0",
|
874 |
+
inbrowser=True,
|
875 |
+
share=is_share,
|
876 |
+
server_port=webui_port_main,
|
877 |
+
quiet=True,
|
878 |
+
)
|