XzJosh committed
Commit 8fbdb1c
1 Parent(s): 475e23d

Upload 5 files

Files changed (5)
  1. Data/Taffy/config.json +97 -0
  2. Data/Taffy/models/G_7600.pth +3 -0
  3. app.py +161 -31
  4. config.yml +21 -12
  5. infer.py +90 -0
Data/Taffy/config.json ADDED
@@ -0,0 +1,97 @@
+{
+  "train": {
+    "log_interval": 100,
+    "eval_interval": 100,
+    "seed": 42,
+    "epochs": 10000,
+    "learning_rate": 0.0002,
+    "betas": [
+      0.8,
+      0.99
+    ],
+    "eps": 1e-09,
+    "batch_size": 12,
+    "fp16_run": false,
+    "lr_decay": 0.99995,
+    "segment_size": 16384,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "skip_optimizer": true,
+    "keep_ckpts": 30
+  },
+  "data": {
+    "training_files": "Data/Taffy/filelists/train.list",
+    "validation_files": "Data/Taffy/filelists/val.list",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 44100,
+    "filter_length": 2048,
+    "hop_length": 512,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null,
+    "add_blank": true,
+    "n_speakers": 700,
+    "cleaned_text": true,
+    "spk2id": {
+      "永雏塔菲": 0
+    }
+  },
+  "model": {
+    "use_spk_conditioned_encoder": true,
+    "use_noise_scaled_mas": true,
+    "use_mel_posterior_encoder": false,
+    "use_duration_discriminator": true,
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [
+      3,
+      7,
+      11
+    ],
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2,
+      2
+    ],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      8,
+      2,
+      2
+    ],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256
+  },
+  "version": "2.0"
+}
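For reference, this config is what the webui and inference code consume at run time. A minimal loading sketch, assuming the repo's utils.get_hparams_from_file helper (standard in VITS-family code) is importable from the project root:

import utils  # Bert-VITS2 utility module, assumed on the path

# Load the dataset config; attribute access mirrors the JSON structure above.
hps = utils.get_hparams_from_file("Data/Taffy/config.json")
print(hps.data.sampling_rate)   # 44100
print(hps.data.spk2id)          # {'永雏塔菲': 0}
print(hps.model.gin_channels)   # 256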
Data/Taffy/models/G_7600.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88ebc1c92d7981f45325106cc40b81524f38585082e1072c2118ed72a5a2c93f
+size 705938526
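The entry above is only a Git LFS pointer; the actual checkpoint is about 706 MB. A hedged sketch of fetching the real file with huggingface_hub; the repo id below is hypothetical and should be replaced with this Space's actual id:

from huggingface_hub import hf_hub_download  # assumed installed

ckpt = hf_hub_download(
    repo_id="XzJosh/Taffy-Bert-VITS2",        # hypothetical Space id, adjust as needed
    filename="Data/Taffy/models/G_7600.pth",
    repo_type="space",
)
print(ckpt)  # local cache path of the downloaded checkpoint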
app.py CHANGED
@@ -3,7 +3,7 @@ import os
 import logging
 
 import re_matching
-from tools.sentence import split_by_language, sentence_split
+from tools.sentence import split_by_language
 
 logging.getLogger("numba").setLevel(logging.WARNING)
 logging.getLogger("markdown_it").setLevel(logging.WARNING)
@@ -17,16 +17,13 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 import torch
-import ssl
-ssl._create_default_https_context = ssl._create_unverified_context
-import nltk
-nltk.download('cmudict')
 import utils
-from infer import infer, latest_version, get_net_g
+from infer import infer, latest_version, get_net_g, infer_multilang
 import gradio as gr
 import webbrowser
 import numpy as np
 from config import config
+from tools.translate import translate
 
 net_g = None
 
@@ -43,11 +40,15 @@ def generate_audio(
     length_scale,
     speaker,
     language,
+    skip_start=False,
+    skip_end=False,
 ):
     audio_list = []
-    silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
+    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
     with torch.no_grad():
-        for piece in slices:
+        for idx, piece in enumerate(slices):
+            skip_start = (idx != 0) and skip_start
+            skip_end = (idx != len(slices) - 1) and skip_end
             audio = infer(
                 piece,
                 sdp_ratio=sdp_ratio,
@@ -59,10 +60,49 @@ def generate_audio(
                 hps=hps,
                 net_g=net_g,
                 device=device,
+                skip_start=skip_start,
+                skip_end=skip_end,
             )
             audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
             audio_list.append(audio16bit)
-            audio_list.append(silence)  # append the silence segment to the list
+            # audio_list.append(silence)  # append the silence segment to the list
+    return audio_list
+
+
+def generate_audio_multilang(
+    slices,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    speaker,
+    language,
+    skip_start=False,
+    skip_end=False,
+):
+    audio_list = []
+    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
+    with torch.no_grad():
+        for idx, piece in enumerate(slices):
+            skip_start = (idx != 0) and skip_start
+            skip_end = (idx != len(slices) - 1) and skip_end
+            audio = infer_multilang(
+                piece,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+                sid=speaker,
+                language=language[idx],
+                hps=hps,
+                net_g=net_g,
+                device=device,
+                skip_start=skip_start,
+                skip_end=skip_end,
+            )
+            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
+            audio_list.append(audio16bit)
+            # audio_list.append(silence)  # append the silence segment to the list
     return audio_list
 
 
@@ -85,7 +125,9 @@ def tts_split(
     para_list = re_matching.cut_para(text)
     audio_list = []
     if not cut_by_sent:
-        for p in para_list:
+        for idx, p in enumerate(para_list):
+            skip_start = idx != 0
+            skip_end = idx != len(para_list) - 1
             audio = infer(
                 p,
                 sdp_ratio=sdp_ratio,
@@ -97,16 +139,22 @@ def tts_split(
                 hps=hps,
                 net_g=net_g,
                 device=device,
+                skip_start=skip_start,
+                skip_end=skip_end,
             )
             audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
             audio_list.append(audio16bit)
             silence = np.zeros((int)(44100 * interval_between_para), dtype=np.int16)
             audio_list.append(silence)
     else:
-        for p in para_list:
+        for idx, p in enumerate(para_list):
+            skip_start = idx != 0
+            skip_end = idx != len(para_list) - 1
             audio_list_sent = []
             sent_list = re_matching.cut_sent(p)
-            for s in sent_list:
+            for idx, s in enumerate(sent_list):
+                skip_start = (idx != 0) and skip_start
+                skip_end = (idx != len(sent_list) - 1) and skip_end
                 audio = infer(
                     s,
                     sdp_ratio=sdp_ratio,
@@ -118,6 +166,8 @@ def tts_split(
                     hps=hps,
                     net_g=net_g,
                    device=device,
+                    skip_start=skip_start,
+                    skip_end=skip_end,
                 )
                 audio_list_sent.append(audio)
                 silence = np.zeros((int)(44100 * interval_between_sent))
@@ -152,40 +202,116 @@ def tts_fn(
                 hps.data.sampling_rate,
                 np.concatenate([np.zeros(hps.data.sampling_rate // 2)]),
             )
-        result = re_matching.text_matching(text)
-        for one in result:
+        result = []
+        for slice in re_matching.text_matching(text):
+            _speaker = slice.pop()
+            temp_contant = []
+            temp_lang = []
+            for lang, content in slice:
+                if "|" in content:
+                    temp = []
+                    temp_ = []
+                    for i in content.split("|"):
+                        if i != "":
+                            temp.append([i])
+                            temp_.append([lang])
+                        else:
+                            temp.append([])
+                            temp_.append([])
+                    temp_contant += temp
+                    temp_lang += temp_
+                else:
+                    if len(temp_contant) == 0:
+                        temp_contant.append([])
+                        temp_lang.append([])
+                    temp_contant[-1].append(content)
+                    temp_lang[-1].append(lang)
+            for i, j in zip(temp_lang, temp_contant):
+                result.append([*zip(i, j), _speaker])
+        for i, one in enumerate(result):
+            skip_start = i != 0
+            skip_end = i != len(result) - 1
             _speaker = one.pop()
-            for lang, content in one:
+            idx = 0
+            while idx < len(one):
+                text_to_generate = []
+                lang_to_generate = []
+                while True:
+                    lang, content = one[idx]
+                    temp_text = [content]
+                    if len(text_to_generate) > 0:
+                        text_to_generate[-1] += [temp_text.pop(0)]
+                        lang_to_generate[-1] += [lang]
+                    if len(temp_text) > 0:
+                        text_to_generate += [[i] for i in temp_text]
+                        lang_to_generate += [[lang]] * len(temp_text)
+                    if idx + 1 < len(one):
+                        idx += 1
+                    else:
+                        break
+                skip_start = (idx != 0) and skip_start
+                skip_end = (idx != len(one) - 1) and skip_end
+                print(text_to_generate, lang_to_generate)
                 audio_list.extend(
-                    generate_audio(
-                        content.split("|"),
+                    generate_audio_multilang(
+                        text_to_generate,
                         sdp_ratio,
                         noise_scale,
                         noise_scale_w,
                         length_scale,
                         _speaker,
-                        lang,
+                        lang_to_generate,
+                        skip_start,
+                        skip_end,
                     )
                 )
+                idx += 1
     elif language.lower() == "auto":
-        sentences_list = split_by_language(text, target_languages=["zh", "ja", "en"])
-        for sentences, lang in sentences_list:
-            lang = lang.upper()
-            if lang == "JA":
-                lang = "JP"
-            sentences = sentence_split(sentences, max=250)
-            for content in sentences:
+        for idx, slice in enumerate(text.split("|")):
+            if slice == "":
+                continue
+            skip_start = idx != 0
+            skip_end = idx != len(text.split("|")) - 1
+            sentences_list = split_by_language(
+                slice, target_languages=["zh", "ja", "en"]
+            )
+            idx = 0
+            while idx < len(sentences_list):
+                text_to_generate = []
+                lang_to_generate = []
+                while True:
+                    content, lang = sentences_list[idx]
+                    temp_text = [content]
+                    lang = lang.upper()
+                    if lang == "JA":
+                        lang = "JP"
+                    if len(text_to_generate) > 0:
+                        text_to_generate[-1] += [temp_text.pop(0)]
+                        lang_to_generate[-1] += [lang]
+                    if len(temp_text) > 0:
+                        text_to_generate += [[i] for i in temp_text]
+                        lang_to_generate += [[lang]] * len(temp_text)
+                    if idx + 1 < len(sentences_list):
+                        idx += 1
+                    else:
+                        break
+                skip_start = (idx != 0) and skip_start
+                skip_end = (idx != len(sentences_list) - 1) and skip_end
+                print(text_to_generate, lang_to_generate)
                 audio_list.extend(
-                    generate_audio(
-                        content.split("|"),
+                    generate_audio_multilang(
+                        text_to_generate,
                         sdp_ratio,
                         noise_scale,
                         noise_scale_w,
                         length_scale,
                         speaker,
-                        lang,
+                        lang_to_generate,
+                        skip_start,
+                        skip_end,
                     )
                 )
+                idx += 1
     else:
         audio_list.extend(
             generate_audio(
@@ -220,10 +346,9 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column():
                 gr.Markdown(value="""
-【AI星瞳①】online speech synthesis (Bert-Vits2 2.0, Chinese/Japanese/English)\n
+【AI塔菲】online speech synthesis (Bert-Vits2 2.0, Chinese/Japanese/English)\n
 Author: Xz乔希 https://space.bilibili.com/5859321\n
-Voice: 星瞳_Official https://space.bilibili.com/401315430\n
-【AI星瞳②】https://huggingface.co/spaces/XzJosh/Star-Bert-VITS2\n
+Voice: 永雏塔菲 https://space.bilibili.com/1265680561\n
 【AI合集】https://www.modelscope.cn/studios/xzjosh/Bert-VITS2\n
 Bert-VITS2 project: https://github.com/Stardust-minus/Bert-VITS2\n
 Please strictly abide by laws and regulations when using this model!\n
@@ -304,6 +429,11 @@ if __name__ == "__main__":
             outputs=[text_output, audio_output],
         )
 
+        trans.click(
+            translate,
+            inputs=[text],
+            outputs=[text],
+        )
         slicer.click(
             tts_split,
             inputs=[
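In the new tts_fn, mixed-language input is grouped into parallel lists of text segments and language tags before being handed to generate_audio_multilang. A small sketch of the "auto" path, reusing split_by_language exactly as the diff does; the example sentence is made up:

from tools.sentence import split_by_language  # repo helper imported at the top of app.py

text = "今天天气不错,let's try mixed input,日本語も大丈夫"
for content, lang in split_by_language(text, target_languages=["zh", "ja", "en"]):
    lang = lang.upper()
    if lang == "JA":  # app.py normalizes the tag to the project's "JP" code
        lang = "JP"
    print(lang, content)

tts_fn then packs these pairs into text_to_generate / lang_to_generate, for example [["今天天气不错,", "let's try mixed input,", "日本語も大丈夫"]] paired with [["ZH", "EN", "JP"]], one outer element per generate_audio_multilang call.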
config.yml CHANGED
@@ -4,10 +4,10 @@
 # Provides a common path configuration so data is kept in one place rather than scattered around
 # Each dataset and its models are stored under this path; all later path settings are relative to dataset_path
 # Leave this empty to make paths relative to the project root
-dataset_path: "Data/XingTong"
+dataset_path: "Data/Taffy"
 
 # Model mirror; defaults to huggingface. Using the openi mirror requires openi_token
-mirror: "openi"
+mirror: ""
 openi_token: "" # openi token
 
 # resample: audio resampling configuration
@@ -26,7 +26,7 @@ resample:
 # Note: a space is required after ":"
 preprocess_text:
   # Path of the raw transcription file; each line should be {wav_path}|{speaker_name}|{language}|{text}.
-  transcription_path: "filelists/XingTong.list"
+  transcription_path: "filelists/Taffy.list"
   # Path of the cleaned text; optional. If empty, it is generated next to the raw text
   cleaned_path: ""
   # Training set path
@@ -36,7 +36,7 @@ preprocess_text:
   # Config file path
   config_path: "config.json"
   # Number of validation entries per speaker
-  val_per_spk: 5
+  val_per_spk: 4
   # Maximum number of validation entries; extras are truncated and moved to the training set
   max_val_total: 8
   # Whether to clean the data
@@ -68,12 +68,12 @@ train_ms:
     WORLD_SIZE: 1
     RANK: 0
     # Environment variables with any name can be added here
-    THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
+    # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
   # Base model settings
   base:
     use_base_model: false
     repo_id: "Stardust_minus/Bert-VITS2"
-    model_image: "Bert-VITS2中日底模" # model name shown on the openi page
+    model_image: "Bert-VITS2中日英底模-fix" # model name shown on the openi page
   # Directory for trained models; older versions stored them under logs/model_name, now they live under Data/<your dataset>/models
   model: "models"
   # Config file path
@@ -84,9 +84,9 @@ train_ms:
 # Note: a space is required after ":"
 webui:
   # Inference device
-  device: "cpu"
+  device: "cuda"
   # Model path
-  model: "models/G_8000.pth"
+  model: "models/G_7600.pth"
   # Config file path
   config_path: "config.json"
   # Port number
@@ -111,9 +111,9 @@ server:
   # Note: every model must have valid model and config paths; an empty path causes a loading error.
   models:
     - # Model path
-      model: "Data/XingTong/models/G_8000.pth"
+      model: "Data/Taffy/models/G_8000.pth"
       # Path of the model's config.json
-      config: "Data/XingTong/config.json"
+      config: "Data/Taffy/config.json"
       # Device for this model; overrides the default if set
      device: "cuda"
       # Default language for this model
@@ -138,9 +138,9 @@ server:
          noise_scale_w: 0.8
          length_scale: 1.2
    - # Model path
-      model: "Data/XingTong/models/G_8000.pth"
+      model: "Data/Taffy/models/G_8000.pth"
       # Path of the model's config.json
-      config: "Data/XingTong/config.json"
+      config: "Data/Taffy/config.json"
       # Device for this model; overrides the default if set
       device: "cpu"
       # Default language for this model
@@ -149,3 +149,12 @@ server:
       # You don't need to list every speaker; unlisted speakers use the defaults
       speakers: [ ] # can also be left empty
 
+
+# Baidu Translate open platform API configuration
+# API docs: https://api.fanyi.baidu.com/doc/21
+# Do not publicly share your app id and key on GitHub or similar sites
+translate:
+  # Your APPID
+  "app_key": ""
+  # Your secret key
+  "secret_key": ""
infer.py CHANGED
@@ -204,3 +204,93 @@ def infer(
         del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
         torch.cuda.empty_cache()
         return audio
+
+
+def infer_multilang(
+    text,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    sid,
+    language,
+    hps,
+    net_g,
+    device,
+    skip_start=False,
+    skip_end=False,
+):
+    bert, ja_bert, en_bert, phones, tones, lang_ids = [], [], [], [], [], []
+    # bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
+    #     text, language, hps, device
+    # )
+    for idx, (t, l) in enumerate(zip(text, language)):
+        skip_start = (idx != 0) or (skip_start and idx == 0)
+        skip_end = (idx != len(text) - 1) or (skip_end and idx == len(text) - 1)
+        (
+            temp_bert,
+            temp_ja_bert,
+            temp_en_bert,
+            temp_phones,
+            temp_tones,
+            temp_lang_ids,
+        ) = get_text(t, l, hps, device)
+        if skip_start:
+            temp_bert = temp_bert[:, 1:]
+            temp_ja_bert = temp_ja_bert[:, 1:]
+            temp_en_bert = temp_en_bert[:, 1:]
+            temp_phones = temp_phones[1:]
+            temp_tones = temp_tones[1:]
+            temp_lang_ids = temp_lang_ids[1:]
+        if skip_end:
+            temp_bert = temp_bert[:, :-1]
+            temp_ja_bert = temp_ja_bert[:, :-1]
+            temp_en_bert = temp_en_bert[:, :-1]
+            temp_phones = temp_phones[:-1]
+            temp_tones = temp_tones[:-1]
+            temp_lang_ids = temp_lang_ids[:-1]
+        bert.append(temp_bert)
+        ja_bert.append(temp_ja_bert)
+        en_bert.append(temp_en_bert)
+        phones.append(temp_phones)
+        tones.append(temp_tones)
+        lang_ids.append(temp_lang_ids)
+    bert = torch.concatenate(bert, dim=1)
+    ja_bert = torch.concatenate(ja_bert, dim=1)
+    en_bert = torch.concatenate(en_bert, dim=1)
+    phones = torch.concatenate(phones, dim=0)
+    tones = torch.concatenate(tones, dim=0)
+    lang_ids = torch.concatenate(lang_ids, dim=0)
+    with torch.no_grad():
+        x_tst = phones.to(device).unsqueeze(0)
+        tones = tones.to(device).unsqueeze(0)
+        lang_ids = lang_ids.to(device).unsqueeze(0)
+        bert = bert.to(device).unsqueeze(0)
+        ja_bert = ja_bert.to(device).unsqueeze(0)
+        en_bert = en_bert.to(device).unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+        del phones
+        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
+        audio = (
+            net_g.infer(
+                x_tst,
+                x_tst_lengths,
+                speakers,
+                tones,
+                lang_ids,
+                bert,
+                ja_bert,
+                en_bert,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+            )[0][0, 0]
+            .data.cpu()
+            .float()
+            .numpy()
+        )
+        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return audio
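For completeness, a usage sketch of the new infer_multilang, mirroring how generate_audio_multilang in app.py calls it. The paths, the example segments, and the get_net_g call are illustrative only and assume the signatures used elsewhere in this repo:

import torch
import utils
from infer import get_net_g, latest_version, infer_multilang

hps = utils.get_hparams_from_file("Data/Taffy/config.json")
device = "cuda" if torch.cuda.is_available() else "cpu"
net_g = get_net_g(
    model_path="Data/Taffy/models/G_7600.pth",  # checkpoint uploaded in this commit
    version=latest_version,
    device=device,
    hps=hps,
)

# One piece is a pair of parallel lists: text segments and their language tags.
audio = infer_multilang(
    ["你好,", "hello, ", "こんにちは"],
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.8,
    length_scale=1.0,
    sid="永雏塔菲",
    language=["ZH", "EN", "JP"],
    hps=hps,
    net_g=net_g,
    device=device,
)
# audio is a float32 numpy array at hps.data.sampling_rate (44100 Hz)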