jianuo committed
Commit c02528e
1 parent: 454c9e4

Support printing the function call stack when an error occurs, and switch the working directory for local inference in a more elegant way

TTSs/base_tts.py CHANGED
@@ -3,6 +3,8 @@ import io
 import scipy.io.wavfile as wavfile
 from pydub import AudioSegment
 
+import traceback
+
 import gradio as gr
 
 
@@ -136,6 +138,8 @@ class Base_TTS(metaclass=abc.ABCMeta):
 
             return None, *mix_background_music(original_audio, 背景音乐, TTS_up, bg_up)
         except Exception as e:
-            return str(e), None, None
+            msg = traceback.format_exc()
+
+            return msg + '\n\n' + str(e), None, None
 
 
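The base_tts.py change replaces a bare str(e) with the full stack trace in the error path. A minimal sketch of the same pattern in isolation (the run_tts_safely wrapper and its tts_call argument are illustrative, not part of this repo):

import traceback

def run_tts_safely(tts_call):
    # tts_call stands in for the real synthesis function
    try:
        return None, tts_call()
    except Exception as e:
        # traceback.format_exc() renders the full call stack of the active
        # exception, so the UI status box shows where it failed, not just why
        msg = traceback.format_exc()
        return msg + '\n\n' + str(e), None

Returning the trace as the first element keeps the output contract intact: the status textbox gets the diagnostic string and the audio outputs stay None.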
TTSs/genshin_local/genshin_bg.py CHANGED
@@ -1,161 +1,168 @@
 # flake8: noqa: E402
 import os
 import sys
+from contextlib import contextmanager
 
-genshin_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'genshin')
-sys.path.append(os.path.abspath(os.path.dirname(__file__)))
-sys.path.append(genshin_path)
-os.chdir(genshin_path)
-
-import genshin.re_matching as re_matching
-from genshin.tools.sentence import split_by_language
-
-import torch
-import genshin.utils as utils
-from genshin.infer import infer, latest_version, get_net_g, infer_multilang
 import gradio as gr
-import numpy as np
-from genshin.config import config
-from genshin.tools.translate import translate
 import librosa
+import numpy as np
+import torch
 
-net_g = None
-
-device = config.webui_config.device
-if device == "mps":
-    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
-
-
-def generate_audio(
-    slices,
-    sdp_ratio,
-    noise_scale,
-    noise_scale_w,
-    length_scale,
-    speaker,
-    language,
-    reference_audio,
-    emotion,
-    style_text,
-    style_weight,
-    skip_start=False,
-    skip_end=False,
-):
-    audio_list = []
-    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
-    with torch.no_grad():
-        for idx, piece in enumerate(slices):
-            skip_start = idx != 0
-            skip_end = idx != len(slices) - 1
-            audio = infer(
-                piece,
-                reference_audio=reference_audio,
-                emotion=emotion,
-                sdp_ratio=sdp_ratio,
-                noise_scale=noise_scale,
-                noise_scale_w=noise_scale_w,
-                length_scale=length_scale,
-                sid=speaker,
-                language=language,
-                hps=hps,
-                net_g=net_g,
-                device=device,
-                skip_start=skip_start,
-                skip_end=skip_end,
-                style_text=style_text,
-                style_weight=style_weight,
-            )
-            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
-            audio_list.append(audio16bit)
-    return audio_list
-
-
-def generate_audio_multilang(
-    slices,
-    sdp_ratio,
-    noise_scale,
-    noise_scale_w,
-    length_scale,
-    speaker,
-    language,
-    reference_audio,
-    emotion,
-    skip_start=False,
-    skip_end=False,
-):
-    audio_list = []
-    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
-    with torch.no_grad():
-        for idx, piece in enumerate(slices):
-            skip_start = idx != 0
-            skip_end = idx != len(slices) - 1
-            audio = infer_multilang(
-                piece,
-                reference_audio=reference_audio,
-                emotion=emotion,
-                sdp_ratio=sdp_ratio,
-                noise_scale=noise_scale,
-                noise_scale_w=noise_scale_w,
-                length_scale=length_scale,
-                sid=speaker,
-                language=language[idx],
-                hps=hps,
-                net_g=net_g,
-                device=device,
-                skip_start=skip_start,
-                skip_end=skip_end,
-            )
-            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
-            audio_list.append(audio16bit)
-    return audio_list
-
-
-def tts_split(
-    text: str,
-    speaker,
-    sdp_ratio,
-    noise_scale,
-    noise_scale_w,
-    length_scale,
-    language,
-    cut_by_sent,
-    interval_between_para,
-    interval_between_sent,
-    reference_audio,
-    emotion,
-    style_text,
-    style_weight,
-):
-    while text.find("\n\n") != -1:
-        text = text.replace("\n\n", "\n")
-    text = text.replace("|", "")
-    para_list = re_matching.cut_para(text)
-    para_list = [p for p in para_list if p != ""]
-    audio_list = []
-    for p in para_list:
-        if not cut_by_sent:
-            audio_list += process_text(
-                p,
-                speaker,
-                sdp_ratio,
-                noise_scale,
-                noise_scale_w,
-                length_scale,
-                language,
-                reference_audio,
-                emotion,
-                style_text,
-                style_weight,
-            )
-            silence = np.zeros((int)(44100 * interval_between_para), dtype=np.int16)
-            audio_list.append(silence)
-        else:
-            audio_list_sent = []
-            sent_list = re_matching.cut_sent(p)
-            sent_list = [s for s in sent_list if s != ""]
-            for s in sent_list:
-                audio_list_sent += process_text(
-                    s,
+
+@contextmanager
+def change_dir():
+    file_path = os.path.abspath(os.path.dirname(__file__))
+    genshin_path = os.path.join(file_path, 'genshin')
+
+    need_rm = []
+    if file_path in sys.path:
+        need_rm.append(file_path)
+    if genshin_path in sys.path:
+        need_rm.append(genshin_path)
+
+    # 保存当前工作目录
+    current_dir = os.getcwd()
+    try:
+        os.chdir(genshin_path)
+        sys.path.append(file_path)
+        sys.path.append(genshin_path)
+        yield
+    finally:
+        os.chdir(current_dir)
+        for path in need_rm:
+            sys.path.remove(path)
+
+
+# genshin_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'genshin')
+# sys.path.append(os.path.abspath(os.path.dirname(__file__)))
+# sys.path.append(genshin_path)
+# os.chdir(genshin_path)
+
+with change_dir():
+    import genshin.re_matching as re_matching
+    from genshin.tools.sentence import split_by_language
+    import genshin.utils as utils
+    from genshin.infer import infer, latest_version, get_net_g, infer_multilang
+    from genshin.config import config
+    from genshin.tools.translate import translate
+
+net_g = None
+
+device = config.webui_config.device
+if device == "mps":
+    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
+
+def generate_audio(
+    slices,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    speaker,
+    language,
+    reference_audio,
+    emotion,
+    style_text,
+    style_weight,
+    skip_start=False,
+    skip_end=False,
+):
+    audio_list = []
+    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
+    with torch.no_grad():
+        for idx, piece in enumerate(slices):
+            skip_start = idx != 0
+            skip_end = idx != len(slices) - 1
+            audio = infer(
+                piece,
+                reference_audio=reference_audio,
+                emotion=emotion,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+                sid=speaker,
+                language=language,
+                hps=hps,
+                net_g=net_g,
+                device=device,
+                skip_start=skip_start,
+                skip_end=skip_end,
+                style_text=style_text,
+                style_weight=style_weight,
+            )
+            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
+            audio_list.append(audio16bit)
+    return audio_list
+
+
+def generate_audio_multilang(
+    slices,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    speaker,
+    language,
+    reference_audio,
+    emotion,
+    skip_start=False,
+    skip_end=False,
+):
+    audio_list = []
+    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
+    with torch.no_grad():
+        for idx, piece in enumerate(slices):
+            skip_start = idx != 0
+            skip_end = idx != len(slices) - 1
+            audio = infer_multilang(
+                piece,
+                reference_audio=reference_audio,
+                emotion=emotion,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+                sid=speaker,
+                language=language[idx],
+                hps=hps,
+                net_g=net_g,
+                device=device,
+                skip_start=skip_start,
+                skip_end=skip_end,
+            )
+            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
+            audio_list.append(audio16bit)
+    return audio_list
+
+
+def tts_split(
+    text: str,
+    speaker,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    language,
+    cut_by_sent,
+    interval_between_para,
+    interval_between_sent,
+    reference_audio,
+    emotion,
+    style_text,
+    style_weight,
+):
+    while text.find("\n\n") != -1:
+        text = text.replace("\n\n", "\n")
+    text = text.replace("|", "")
+    para_list = re_matching.cut_para(text)
+    para_list = [p for p in para_list if p != ""]
+    audio_list = []
+    for p in para_list:
+        if not cut_by_sent:
+            audio_list += process_text(
+                p,
                 speaker,
                 sdp_ratio,
                 noise_scale,
@@ -167,83 +174,119 @@ def tts_split(
                 style_text,
                 style_weight,
             )
-                silence = np.zeros((int)(44100 * interval_between_sent))
-                audio_list_sent.append(silence)
-            if (interval_between_para - interval_between_sent) > 0:
-                silence = np.zeros(
-                    (int)(44100 * (interval_between_para - interval_between_sent))
-                )
-                audio_list_sent.append(silence)
-            audio16bit = gr.processing_utils.convert_to_16_bit_wav(
-                np.concatenate(audio_list_sent)
-            )  # 对完整句子做音量归一
-            audio_list.append(audio16bit)
-    audio_concat = np.concatenate(audio_list)
-    return ("Success", (hps.data.sampling_rate, audio_concat))
-
-
-def process_mix(slice):
-    _speaker = slice.pop()
-    _text, _lang = [], []
-    for lang, content in slice:
-        content = content.split("|")
-        content = [part for part in content if part != ""]
-        if len(content) == 0:
-            continue
-        if len(_text) == 0:
-            _text = [[part] for part in content]
-            _lang = [[lang] for part in content]
-        else:
-            _text[-1].append(content[0])
-            _lang[-1].append(lang)
-        if len(content) > 1:
-            _text += [[part] for part in content[1:]]
-            _lang += [[lang] for part in content[1:]]
-    return _text, _lang, _speaker
-
-
-def process_auto(text):
-    _text, _lang = [], []
-    for slice in text.split("|"):
-        if slice == "":
-            continue
-        temp_text, temp_lang = [], []
-        sentences_list = split_by_language(slice, target_languages=["zh", "ja", "en"])
-        for sentence, lang in sentences_list:
-            if sentence == "":
+            silence = np.zeros((int)(44100 * interval_between_para), dtype=np.int16)
+            audio_list.append(silence)
+        else:
+            audio_list_sent = []
+            sent_list = re_matching.cut_sent(p)
+            sent_list = [s for s in sent_list if s != ""]
+            for s in sent_list:
+                audio_list_sent += process_text(
+                    s,
+                    speaker,
+                    sdp_ratio,
+                    noise_scale,
+                    noise_scale_w,
+                    length_scale,
+                    language,
+                    reference_audio,
+                    emotion,
+                    style_text,
+                    style_weight,
+                )
+                silence = np.zeros((int)(44100 * interval_between_sent))
+                audio_list_sent.append(silence)
+            if (interval_between_para - interval_between_sent) > 0:
+                silence = np.zeros(
+                    (int)(44100 * (interval_between_para - interval_between_sent))
+                )
+                audio_list_sent.append(silence)
+            audio16bit = gr.processing_utils.convert_to_16_bit_wav(
+                np.concatenate(audio_list_sent)
+            )  # 对完整句子做音量归一
+            audio_list.append(audio16bit)
+    audio_concat = np.concatenate(audio_list)
+    return ("Success", (hps.data.sampling_rate, audio_concat))
+
+
+def process_mix(slice):
+    _speaker = slice.pop()
+    _text, _lang = [], []
+    for lang, content in slice:
+        content = content.split("|")
+        content = [part for part in content if part != ""]
+        if len(content) == 0:
            continue
-            temp_text.append(sentence)
-            temp_lang.append(lang.upper())
-        _text.append(temp_text)
-        _lang.append(temp_lang)
-    return _text, _lang
-
-
-def process_text(
-    text: str,
-    speaker,
-    sdp_ratio,
-    noise_scale,
-    noise_scale_w,
-    length_scale,
-    language,
-    reference_audio,
-    emotion,
-    style_text=None,
-    style_weight=0,
-):
-    audio_list = []
-    if language == "mix":
-        bool_valid, str_valid = re_matching.validate_text(text)
-        if not bool_valid:
-            return str_valid, (
-                hps.data.sampling_rate,
-                np.concatenate([np.zeros(hps.data.sampling_rate // 2)]),
-            )
-        for slice in re_matching.text_matching(text):
-            _text, _lang, _speaker = process_mix(slice)
-            if _speaker is None:
+        if len(_text) == 0:
+            _text = [[part] for part in content]
+            _lang = [[lang] for part in content]
+        else:
+            _text[-1].append(content[0])
+            _lang[-1].append(lang)
+        if len(content) > 1:
+            _text += [[part] for part in content[1:]]
+            _lang += [[lang] for part in content[1:]]
+    return _text, _lang, _speaker
+
+
+def process_auto(text):
+    _text, _lang = [], []
+    for slice in text.split("|"):
+        if slice == "":
            continue
+        temp_text, temp_lang = [], []
+        sentences_list = split_by_language(slice, target_languages=["zh", "ja", "en"])
+        for sentence, lang in sentences_list:
+            if sentence == "":
+                continue
+            temp_text.append(sentence)
+            temp_lang.append(lang.upper())
+        _text.append(temp_text)
+        _lang.append(temp_lang)
+    return _text, _lang
+
+
+def process_text(
+    text: str,
+    speaker,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    language,
+    reference_audio,
+    emotion,
+    style_text=None,
+    style_weight=0,
+):
+    audio_list = []
+    if language == "mix":
+        bool_valid, str_valid = re_matching.validate_text(text)
+        if not bool_valid:
+            return str_valid, (
+                hps.data.sampling_rate,
+                np.concatenate([np.zeros(hps.data.sampling_rate // 2)]),
+            )
+        for slice in re_matching.text_matching(text):
+            _text, _lang, _speaker = process_mix(slice)
+            if _speaker is None:
+                continue
+            print(f"Text: {_text}\nLang: {_lang}")
+            audio_list.extend(
+                generate_audio_multilang(
+                    _text,
+                    sdp_ratio,
+                    noise_scale,
+                    noise_scale_w,
+                    length_scale,
+                    _speaker,
+                    _lang,
+                    reference_audio,
+                    emotion,
+                )
+            )
+        elif language.lower() == "auto":
+            _text, _lang = process_auto(text)
         print(f"Text: {_text}\nLang: {_lang}")
         audio_list.extend(
             generate_audio_multilang(
@@ -252,293 +295,276 @@ def process_text(
                 noise_scale,
                 noise_scale_w,
                 length_scale,
-                _speaker,
+                speaker,
                 _lang,
                 reference_audio,
                 emotion,
             )
         )
-    elif language.lower() == "auto":
-        _text, _lang = process_auto(text)
-        print(f"Text: {_text}\nLang: {_lang}")
-        audio_list.extend(
-            generate_audio_multilang(
-                _text,
-                sdp_ratio,
-                noise_scale,
-                noise_scale_w,
-                length_scale,
-                speaker,
-                _lang,
-                reference_audio,
-                emotion,
-            )
-        )
-    else:
-        audio_list.extend(
-            generate_audio(
-                text.split("|"),
-                sdp_ratio,
-                noise_scale,
-                noise_scale_w,
-                length_scale,
-                speaker,
-                language,
-                reference_audio,
-                emotion,
-                style_text,
-                style_weight,
+    else:
+        audio_list.extend(
+            generate_audio(
+                text.split("|"),
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                speaker,
+                language,
+                reference_audio,
+                emotion,
+                style_text,
+                style_weight,
+            )
         )
+    return audio_list
+
+
+def tts_fn(
+    text: str,
+    speaker,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    language,
+    reference_audio,
+    emotion,
+    prompt_mode,
+    style_text=None,
+    style_weight=0,
+):
+    if style_text == "":
+        style_text = None
+    if prompt_mode == "Audio prompt":
+        if reference_audio == None:
+            return ("Invalid audio prompt", None)
+        else:
+            reference_audio = load_audio(reference_audio)[1]
+    else:
+        reference_audio = None
+
+    audio_list = process_text(
+        text,
+        speaker,
+        sdp_ratio,
+        noise_scale,
+        noise_scale_w,
+        length_scale,
+        language,
+        reference_audio,
+        emotion,
+        style_text,
+        style_weight,
     )
-    return audio_list
-
-
-def tts_fn(
-    text: str,
-    speaker,
-    sdp_ratio,
-    noise_scale,
-    noise_scale_w,
-    length_scale,
-    language,
-    reference_audio,
-    emotion,
-    prompt_mode,
-    style_text=None,
-    style_weight=0,
-):
-
-    if style_text == "":
-        style_text = None
-    if prompt_mode == "Audio prompt":
-        if reference_audio == None:
-            return ("Invalid audio prompt", None)
+
+    audio_concat = np.concatenate(audio_list)
+    return "Success", (hps.data.sampling_rate, audio_concat)
+
+
+def format_utils(text, speaker):
+    _text, _lang = process_auto(text)
+    res = f"[{speaker}]"
+    for lang_s, content_s in zip(_lang, _text):
+        for lang, content in zip(lang_s, content_s):
+            res += f"<{lang.lower()}>{content}"
+        res += "|"
+    return "mix", res[:-1]
+
+
+def load_audio(path):
+    audio, sr = librosa.load(path, 48000)
+    # audio = librosa.resample(audio, 44100, 48000)
+    return sr, audio
+
+
+def gr_util(item):
+    if item == "Text prompt":
+        return {"visible": True, "__type__": "update"}, {
+            "visible": False,
+            "__type__": "update",
+        }
    else:
-            reference_audio = load_audio(reference_audio)[1]
-    else:
-        reference_audio = None
-
-    audio_list = process_text(
-        text,
-        speaker,
-        sdp_ratio,
-        noise_scale,
-        noise_scale_w,
-        length_scale,
-        language,
-        reference_audio,
-        emotion,
-        style_text,
-        style_weight,
-    )
+        return {"visible": False, "__type__": "update"}, {
+            "visible": True,
+            "__type__": "update",
+        }
 
-    audio_concat = np.concatenate(audio_list)
-    return "Success", (hps.data.sampling_rate, audio_concat)
-
-
-def format_utils(text, speaker):
-    _text, _lang = process_auto(text)
-    res = f"[{speaker}]"
-    for lang_s, content_s in zip(_lang, _text):
-        for lang, content in zip(lang_s, content_s):
-            res += f"<{lang.lower()}>{content}"
-        res += "|"
-    return "mix", res[:-1]
-
-
-def load_audio(path):
-    audio, sr = librosa.load(path, 48000)
-    # audio = librosa.resample(audio, 44100, 48000)
-    return sr, audio
-
-
-def gr_util(item):
-    if item == "Text prompt":
-        return {"visible": True, "__type__": "update"}, {
-            "visible": False,
-            "__type__": "update",
-        }
-    else:
-        return {"visible": False, "__type__": "update"}, {
-            "visible": True,
-            "__type__": "update",
-        }
-
-
-hps = utils.get_hparams_from_file(config.webui_config.config_path)
-# 若config.json中未指定版本则默认为最新版本
-version = hps.version if hasattr(hps, "version") else latest_version
-net_g = get_net_g(
-    model_path=config.webui_config.model, version=version, device=device, hps=hps
-)
-speaker_ids = hps.data.spk2id
-speakers = list(speaker_ids.keys())
-languages = ["ZH", "JP", "EN", "mix", "auto"]
-
-
-def get_advanced_block():
-    with gr.Blocks() as genshin_local:
-        gr.Markdown('原作者:https://www.bilibili.com/read/cv26659988/')
-        with gr.Row():
-            with gr.Column():
-                text = gr.TextArea(
-                    label="输入文本内容",
-                    placeholder="""
-如果你选择语言为\'mix\',必须按照格式输入,否则报错:
-格式举例(zh是中文,jp是日语,不区分大小写;说话人举例:gongzi):
-[说话人1]<zh>你好,こんにちは! <jp>こんにちは,世界。
-[说话人2]<zh>你好吗?<jp>元気ですか?
-[说话人3]<zh>谢谢。<jp>どういたしまして。
-...
-另外,所有的语言选项都可以用'|'分割长段实现分句生成。
-""",
-                )
-                trans = gr.Button("中翻日", variant="primary")
-                slicer = gr.Button("快速切分", variant="primary")
-                formatter = gr.Button("检测语言,并整理为 MIX 格式", variant="primary")
-                speaker = gr.Dropdown(
-                    choices=speakers, value=speakers[0], label="Speaker"
-                )
-                _ = gr.Markdown(
-                    value="提示模式(Prompt mode):可选文字提示或音频提示,用于生成文字或音频指定风格的声音。\n",
-                    visible=False,
-                )
-                prompt_mode = gr.Radio(
-                    ["Text prompt", "Audio prompt"],
-                    label="Prompt Mode",
-                    value="Text prompt",
-                    visible=False,
-                )
-                text_prompt = gr.Textbox(
-                    label="Text prompt",
-                    placeholder="用文字描述生成风格。如:Happy",
-                    value="Happy",
-                    visible=False,
-                )
-                audio_prompt = gr.Audio(
-                    label="Audio prompt", type="filepath", visible=False
-                )
-                sdp_ratio = gr.Slider(
-                    minimum=0, maximum=1, value=0.5, step=0.1, label="SDP Ratio"
-                )
-                noise_scale = gr.Slider(
-                    minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
-                )
-                noise_scale_w = gr.Slider(
-                    minimum=0.1, maximum=2, value=0.9, step=0.1, label="Noise_W"
-                )
-                length_scale = gr.Slider(
-                    minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
-                )
-                language = gr.Dropdown(
-                    choices=languages, value=languages[0], label="Language"
-                )
-                btn = gr.Button("生成音频!", variant="primary")
-            with gr.Column():
-                with gr.Accordion("融合文本语义", open=False):
-                    gr.Markdown(
-                        value="使用辅助文本的语意来辅助生成对话(语言保持与主文本相同)\n\n"
-                        "**注意**:不要使用**指令式文本**(如:开心),要使用**带有强烈情感的文本**(如:我好快乐!!!)\n\n"
-                        "效果较不明确,留空即为不使用该功能"
+
+hps = utils.get_hparams_from_file(config.webui_config.config_path)
+# 若config.json中未指定版本则默认为最新版本
+version = hps.version if hasattr(hps, "version") else latest_version
+net_g = get_net_g(
+    model_path=config.webui_config.model, version=version, device=device, hps=hps
+)
+speaker_ids = hps.data.spk2id
+speakers = list(speaker_ids.keys())
+languages = ["ZH", "JP", "EN", "mix", "auto"]
+
+
+def get_advanced_block():
+    with gr.Blocks() as genshin_local:
+        gr.Markdown('原作者:https://www.bilibili.com/read/cv26659988/')
+        with gr.Row():
+            with gr.Column():
+                text = gr.TextArea(
+                    label="输入文本内容",
+                    placeholder="""
+如果你选择语言为\'mix\',必须按照格式输入,否则报错:
+格式举例(zh是中文,jp是日语,不区分大小写;说话人举例:gongzi):
+[说话人1]<zh>你好,こんにちは! <jp>こんにちは,世界。
+[说话人2]<zh>你好吗?<jp>元気ですか?
+[说话人3]<zh>谢谢。<jp>どういたしまして。
+...
+另外,所有的语言选项都可以用'|'分割长段实现分句生成。
+""",
                 )
-                    style_text = gr.Textbox(label="辅助文本")
-                    style_weight = gr.Slider(
-                        minimum=0,
-                        maximum=1,
-                        value=0.7,
-                        step=0.1,
-                        label="Weight",
-                        info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本",
+                trans = gr.Button("中翻日", variant="primary")
+                slicer = gr.Button("快速切分", variant="primary")
+                formatter = gr.Button("检测语言,并整理为 MIX 格式", variant="primary")
+                speaker = gr.Dropdown(
+                    choices=speakers, value=speakers[0], label="Speaker"
                 )
-                with gr.Row():
-                    with gr.Column():
-                        interval_between_sent = gr.Slider(
-                            minimum=0,
-                            maximum=5,
-                            value=0.2,
-                            step=0.1,
-                            label="句间停顿(秒),勾选按句切分才生效",
+                _ = gr.Markdown(
+                    value="提示模式(Prompt mode):可选文字提示或音频提示,用于生成文字或音频指定风格的声音。\n",
+                    visible=False,
+                )
+                prompt_mode = gr.Radio(
+                    ["Text prompt", "Audio prompt"],
+                    label="Prompt Mode",
+                    value="Text prompt",
+                    visible=False,
+                )
+                text_prompt = gr.Textbox(
+                    label="Text prompt",
+                    placeholder="用文字描述生成风格。如:Happy",
+                    value="Happy",
+                    visible=False,
+                )
+                audio_prompt = gr.Audio(
+                    label="Audio prompt", type="filepath", visible=False
+                )
+                sdp_ratio = gr.Slider(
+                    minimum=0, maximum=1, value=0.5, step=0.1, label="SDP Ratio"
+                )
+                noise_scale = gr.Slider(
+                    minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
+                )
+                noise_scale_w = gr.Slider(
+                    minimum=0.1, maximum=2, value=0.9, step=0.1, label="Noise_W"
+                )
+                length_scale = gr.Slider(
+                    minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
+                )
+                language = gr.Dropdown(
+                    choices=languages, value=languages[0], label="Language"
+                )
+                btn = gr.Button("生成音频!", variant="primary")
+            with gr.Column():
+                with gr.Accordion("融合文本语义", open=False):
+                    gr.Markdown(
+                        value="使用辅助文本的语意来辅助生成对话(语言保持与主文本相同)\n\n"
+                        "**注意**:不要使用**指令式文本**(如:开心),要使用**带有强烈情感的文本**(如:我好快乐!!!)\n\n"
+                        "效果较不明确,留空即为不使用该功能"
                     )
-                        interval_between_para = gr.Slider(
+                    style_text = gr.Textbox(label="辅助文本")
+                    style_weight = gr.Slider(
                         minimum=0,
-                            maximum=10,
-                            value=1,
+                        maximum=1,
+                        value=0.7,
                         step=0.1,
-                            label="段间停顿(秒),需要大于句间停顿才有效",
-                        )
-                        opt_cut_by_sent = gr.Checkbox(
-                            label="按句切分 在按段落切分的基础上再按句子切分文本"
+                        label="Weight",
+                        info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本",
                     )
-                        slicer = gr.Button("切分生成", variant="primary")
-                        text_output = gr.Textbox(label="状态信息")
-                        audio_output = gr.Audio(label="输出音频")
-                        # explain_image = gr.Image(
-                        #     label="参数解释信息",
-                        #     show_label=True,
-                        #     show_share_button=False,
-                        #     show_download_button=False,
-                        #     value=os.path.abspath("./img/参数说明.png"),
-                        # )
-        btn.click(
-            tts_fn,
-            inputs=[
-                text,
-                speaker,
-                sdp_ratio,
-                noise_scale,
-                noise_scale_w,
-                length_scale,
-                language,
-                audio_prompt,
-                text_prompt,
-                prompt_mode,
-                style_text,
-                style_weight,
-            ],
-            outputs=[text_output, audio_output],
-        )
+                with gr.Row():
+                    with gr.Column():
+                        interval_between_sent = gr.Slider(
+                            minimum=0,
+                            maximum=5,
+                            value=0.2,
+                            step=0.1,
+                            label="句间停顿(秒),勾选按句切分才生效",
+                        )
+                        interval_between_para = gr.Slider(
+                            minimum=0,
+                            maximum=10,
+                            value=1,
+                            step=0.1,
+                            label="段间停顿(秒),需要大于句间停顿才有效",
+                        )
+                        opt_cut_by_sent = gr.Checkbox(
+                            label="按句切分 在按段落切分的基础上再按句子切分文本"
+                        )
+                        slicer = gr.Button("切分生成", variant="primary")
+                        text_output = gr.Textbox(label="状态信息")
+                        audio_output = gr.Audio(label="输出音频")
+                        # explain_image = gr.Image(
+                        #     label="参数解释信息",
+                        #     show_label=True,
+                        #     show_share_button=False,
+                        #     show_download_button=False,
+                        #     value=os.path.abspath("./img/参数说明.png"),
+                        # )
+        btn.click(
+            tts_fn,
+            inputs=[
+                text,
+                speaker,
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                language,
+                audio_prompt,
+                text_prompt,
+                prompt_mode,
+                style_text,
+                style_weight,
+            ],
+            outputs=[text_output, audio_output],
+        )
 
-        trans.click(
-            translate,
-            inputs=[text],
-            outputs=[text],
-        )
-        slicer.click(
-            tts_split,
-            inputs=[
-                text,
-                speaker,
-                sdp_ratio,
-                noise_scale,
-                noise_scale_w,
-                length_scale,
-                language,
-                opt_cut_by_sent,
-                interval_between_para,
-                interval_between_sent,
-                audio_prompt,
-                text_prompt,
-                style_text,
-                style_weight,
-            ],
-            outputs=[text_output, audio_output],
-        )
+        trans.click(
+            translate,
+            inputs=[text],
+            outputs=[text],
+        )
+        slicer.click(
+            tts_split,
+            inputs=[
+                text,
+                speaker,
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                language,
+                opt_cut_by_sent,
+                interval_between_para,
+                interval_between_sent,
+                audio_prompt,
+                text_prompt,
+                style_text,
+                style_weight,
+            ],
+            outputs=[text_output, audio_output],
+        )
 
-        prompt_mode.change(
-            lambda x: gr_util(x),
-            inputs=[prompt_mode],
-            outputs=[text_prompt, audio_prompt],
-        )
+        prompt_mode.change(
+            lambda x: gr_util(x),
+            inputs=[prompt_mode],
+            outputs=[text_prompt, audio_prompt],
+        )
 
-        audio_prompt.upload(
-            lambda x: load_audio(x),
-            inputs=[audio_prompt],
-            outputs=[audio_prompt],
-        )
+        audio_prompt.upload(
+            lambda x: load_audio(x),
+            inputs=[audio_prompt],
+            outputs=[audio_prompt],
+        )
 
-        formatter.click(
-            format_utils,
-            inputs=[text, speaker],
-            outputs=[language, text],
-        )
-    return genshin_local
+        formatter.click(
+            format_utils,
+            inputs=[text, speaker],
+            outputs=[language, text],
+        )
+    return genshin_local
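The heart of this file's rewrite is change_dir(): the old module-level os.chdir leaked into the whole process, while the context manager scopes the working-directory and sys.path changes to a with block and restores them even if an import fails. A self-contained sketch of the pattern (the pushd name and single-path signature are hypothetical simplifications, not this repo's API):

import os
import sys
from contextlib import contextmanager

@contextmanager
def pushd(path):
    # temporarily make `path` the working directory and importable,
    # restoring the previous state on exit, even after an exception
    prev_dir = os.getcwd()
    newly_added = path not in sys.path
    try:
        os.chdir(path)
        if newly_added:
            sys.path.append(path)
        yield
    finally:
        os.chdir(prev_dir)
        if newly_added:
            sys.path.remove(path)

# usage sketch: imports that resolve files relative to the model directory
# stay scoped to the block
# with pushd('/path/to/genshin'):
#     import model_module

One difference worth noting: the committed change_dir() records the paths that were already on sys.path before appending and removes those entries in its finally block, whereas this sketch removes only the path it added itself.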
TTSs/genshin_local/genshin_local_tts.py CHANGED
@@ -1,27 +1,33 @@
 import io
+import logging
 import os
+import traceback
 
 import gradio as gr
 import scipy.io.wavfile as wavfile
 from pydub import AudioSegment
 
-import importlib
-import logging
-
 from TTSs.base_tts import Base_TTS
 
 
-
 class genshin_local_TTS(Base_TTS):
-
     def __init__(self):
         if self.is_show():
             try:
-                self.speakers_genshin_local = importlib.import_module('TTSs.genshin_bg').speakers
-                self.languages = importlib.import_module('TTSs.genshin_bg').languages
-                self.genshin_tts_fn = importlib.import_module('TTSs.genshin_bg').tts_fn
+                from .genshin_bg import speakers
+                from .genshin_bg import languages
+                from .genshin_bg import tts_fn
+                from .genshin_bg import change_dir
+
+                self.speakers_genshin_local = speakers
+                self.languages = languages
+                self.genshin_tts_fn = tts_fn
+                self.change_dir = change_dir
+
                 logging.info('导入原神本地语音合成模块成功')
             except Exception as e:
+                traceback.print_exc()
+
                 logging.error('导入原神本地语音合成模块失败')
                 logging.error(e)
             else:
@@ -95,16 +101,18 @@ class genshin_local_TTS(Base_TTS):
                          noise_scale_w_local,
                          length_scale_local,
                          language_local):
-        ori_audio_data = self.genshin_tts_fn(text, speaker_local,
-                                             sdp_ratio_local,
-                                             noise_scale_local,
-                                             noise_scale_w_local,
-                                             length_scale_local,
-                                             language_local,
-                                             None, 'Happy',
-                                             'Text prompt',
-                                             'style_text',
-                                             0.7)[1]
+
+        with self.change_dir():
+            ori_audio_data = self.genshin_tts_fn(text, speaker_local,
+                                                 sdp_ratio_local,
+                                                 noise_scale_local,
+                                                 noise_scale_w_local,
+                                                 length_scale_local,
+                                                 language_local,
+                                                 None, 'Happy',
+                                                 'Text prompt',
+                                                 'style_text',
+                                                 0.7)[1]
 
         wav_io = io.BytesIO()
         wavfile.write(wav_io, ori_audio_data[0], ori_audio_data[1])
@@ -112,11 +120,10 @@ class genshin_local_TTS(Base_TTS):
         original_audio = AudioSegment.from_wav(wav_io)
         return original_audio
 
-
     def search_speaker(self, search_value):
        for s in self.speakers_genshin_local:
            if search_value == s:
                return s
        for s in self.speakers_genshin_local:
            if search_value in s:
-                return s
+                return s
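Two patterns in genshin_local_tts.py generalize well: the heavy backend is imported lazily inside __init__ so the app still starts when the local model is unavailable, and an import failure prints the whole traceback before the short log message. A hypothetical skeleton of the idea (the class and names are illustrative, not part of this repo):

import logging
import traceback

class OptionalBackend:
    def __init__(self, enabled: bool):
        self.ready = False
        if not enabled:
            return
        try:
            # stand-in for the heavy import, e.g. `from .genshin_bg import tts_fn`
            from wave import open as backend_fn
            self.backend_fn = backend_fn
            self.ready = True
            logging.info('backend module imported')
        except Exception as e:
            traceback.print_exc()  # full stack first ...
            logging.error('backend import failed: %s', e)  # ... then the summary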