Mahiruoshi commited on
Commit
534942c
·
verified ·
1 Parent(s): 146c591

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +225 -13
app.py CHANGED
@@ -15,7 +15,8 @@ logging.basicConfig(
15
  )
16
 
17
  logger = logging.getLogger(__name__)
18
-
 
19
  import librosa
20
  import numpy as np
21
  import torch
@@ -23,7 +24,7 @@ import torch.nn as nn
23
  from torch.utils.data import Dataset
24
  from torch.utils.data import DataLoader, Dataset
25
  from tqdm import tqdm
26
- from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
27
 
28
 
29
  import gradio as gr
@@ -40,7 +41,7 @@ import utils
40
  from models import SynthesizerTrn
41
  from text.symbols import symbols
42
  import sys
43
-
44
  from tools.translate import translate
45
 
46
  net_g = None
@@ -132,9 +133,10 @@ def infer(
132
  sid,
133
  style_text=None,
134
  style_weight=0.7,
 
135
  ):
136
-
137
- language= 'JP' if is_japanese(text) else 'ZH'
138
  bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
139
  text,
140
  language,
@@ -198,6 +200,146 @@ def loadmodel(model):
198
  _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
199
  return "success"
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  if __name__ == "__main__":
202
  languages = [ "Auto", "ZH", "JP"]
203
  modelPaths = []
@@ -206,7 +348,7 @@ if __name__ == "__main__":
206
  modelPaths.append(os.path.join(dirpath, filename))
207
  hps = utils.get_hparams_from_file('Data/BangDream/configs/config.json')
208
  net_g = get_net_g(
209
- model_path=modelPaths[-1], device=device, hps=hps
210
  )
211
  speaker_ids = hps.data.spk2id
212
  speakers = list(speaker_ids.keys())
@@ -217,7 +359,7 @@ if __name__ == "__main__":
217
  [好玩的](http://love.soyorin.top/)\n
218
  该界面的真实链接(国内可用): https://mahiruoshi-bangdream-bert-vits2.hf.space/\n
219
  API: https://mahiruoshi-bert-vits2-api.hf.space/ \n
220
- 调用方式: https://mahiruoshi-bert-vits2-api.hf.space/?text=%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E6%BC%94%E5%A5%8F%E6%98%A5%E6%97%A5%E5%BD%B1&speaker=%E9%A6%99%E6%BE%84\n
221
  推荐搭配[Legado开源阅读](https://github.com/gedoor/legado)或[聊天bot](https://github.com/Paraworks/BangDreamAi)使用\n
222
  二创请标注作者:B站@Mahiroshi: https://space.bilibili.com/19874615\n
223
  训练数据集归属:BangDream及少歌手游,提取自BestDori,[数据集获取流程](https://nijigaku.top/2023/09/29/Bestbushiroad%E8%AE%A1%E5%88%92-vits-%E9%9F%B3%E9%A2%91%E6%8A%93%E5%8F%96%E5%8F%8A%E6%95%B0%E6%8D%AE%E9%9B%86%E5%AF%B9%E9%BD%90/)\n
@@ -238,6 +380,9 @@ if __name__ == "__main__":
238
  length_scale = gr.Slider(
239
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
240
  )
 
 
 
241
  with gr.Accordion(label="参数设定", open=True):
242
  sdp_ratio = gr.Slider(
243
  minimum=0, maximum=1, value=0.5, step=0.01, label="SDP/DP混合比"
@@ -262,7 +407,12 @@ if __name__ == "__main__":
262
  info="输入纯日语或者中文",
263
  value="我是来结束这个乐队的。",
264
  )
265
- style_text = gr.Textbox(label="辅助文本",info="语言保持跟主文本一致",placeholder="为什么要演奏春日影!")
 
 
 
 
 
266
  style_weight = gr.Slider(
267
  minimum=0,
268
  maximum=1,
@@ -274,7 +424,7 @@ if __name__ == "__main__":
274
  btn = gr.Button("点击生成", variant="primary")
275
  audio_output = gr.Audio(label="Output Audio")
276
  btntran = gr.Button("快速中翻日")
277
- translateResult = gr.TextArea(label="百度翻译",value="从这里翻译后的文本")
278
  btntran.click(translate, inputs=[text], outputs = [translateResult])
279
 
280
  btn.click(
@@ -288,14 +438,76 @@ if __name__ == "__main__":
288
  speaker,
289
  style_text,
290
  style_weight,
 
291
  ],
292
  outputs=[audio_output],
293
  )
294
  with gr.TabItem('少歌在2.2版本'):
295
- gr.Markdown(
296
- '<div align="center">'
297
- f'<iframe style="width:100%;height:400px;" src="https://mahiruoshi-mygo-vits-bert.hf.space/" frameborder="0"></iframe>'
298
- '</div>'
299
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  print("推理页面已开启!")
301
  app.launch(share=True)
 
15
  )
16
 
17
  logger = logging.getLogger(__name__)
18
+ import shutil
19
+ from scipy.io.wavfile import write
20
  import librosa
21
  import numpy as np
22
  import torch
 
24
  from torch.utils.data import Dataset
25
  from torch.utils.data import DataLoader, Dataset
26
  from tqdm import tqdm
27
+ from tools.sentence import extrac, is_japanese, is_chinese, seconds_to_ass_time, extract_text_from_file, remove_annotations,extract_and_convert
28
 
29
 
30
  import gradio as gr
 
41
  from models import SynthesizerTrn
42
  from text.symbols import symbols
43
  import sys
44
+ import re
45
  from tools.translate import translate
46
 
47
  net_g = None
 
133
  sid,
134
  style_text=None,
135
  style_weight=0.7,
136
+ language = "Auto",
137
  ):
138
+ if language == "Auto":
139
+ language= 'JP' if is_japanese(text) else 'ZH'
140
  bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
141
  text,
142
  language,
 
200
  _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
201
  return "success"
202
 
203
def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
    """Synthesize one group of sentences into a WAV file plus an ASS subtitle file.

    Each sentence may be prefixed with ``"<name>|"``. The name selects the
    voice: directly when it is a key of ``hps.data.spk2id``, or through the
    alias table parsed from ``spealerList``. A resolved speaker stays active
    for the following sentences until another one is resolved.

    Args:
        group: Iterable of sentence strings (``"name|text"`` or plain text).
        outputPath: Directory that receives ``audiobook_part_<n>.wav`` and
            ``audiobook_part_<n>.ass``.
        group_index: Number used in the two output file names.
        sampling_rate: Rate used for the WAV header, the silence padding and
            the subtitle timing.
        speaker: Default speaker used until a sentence selects another one.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded
            unchanged to ``infer_simple``.
        spealerList: Newline-separated ``"speaker|alias"`` lines. Lines
            without a ``|`` are ignored.
        silenceTime: Pause, in seconds, appended after every sentence.

    Returns:
        Tuple ``(hps.data.sampling_rate, samples)`` where ``samples`` is the
        concatenation of all synthesized sentences and pauses (an empty
        array when nothing could be synthesized).
    """
    ass_header = "\n".join((
        "[Script Info]",
        "; 我没意见",
        "Title: Audiobook",
        "ScriptType: v4.00+",
        "WrapStyle: 0",
        "PlayResX: 640",
        "PlayResY: 360",
        "ScaledBorderAndShadow: yes",
        "[V4+ Styles]",
        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
        "Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1",
        "[Events]",
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
        "",
    ))

    # Parse the alias table once. Lines without a "|" (for example the "无"
    # placeholder passed when the table is empty) carry no mapping and are
    # skipped instead of raising IndexError for every sentence.
    alias_to_speaker = {}
    for line in spealerList.split("\n"):
        fields = line.split("|")
        if len(fields) >= 2:
            alias_to_speaker[fields[1].strip()] = fields[0].strip()

    # The pause must be measured at the same rate as the timeline below,
    # otherwise the subtitles slowly drift away from the audio.
    silence_frames = int(silenceTime * sampling_rate)

    audio_fin = []
    ass_entries = []
    start_time = 0
    for sentence in group:
        if sentence == '\n':
            continue

        FakeSpeaker = sentence.split("|")[0]
        print(FakeSpeaker)
        if FakeSpeaker in hps.data.spk2id.keys():
            speaker = FakeSpeaker
        # An alias entry takes precedence over a direct speaker-name match.
        speaker = alias_to_speaker.get(FakeSpeaker, speaker)

        text = (remove_annotations(sentence.split("|")[-1]).replace(" ","")+"。").replace(",。","。").replace("。。","。")
        try:
            audio = infer_simple(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, speaker)
        except Exception:
            # Best effort: one sentence that cannot be synthesized must not
            # abort the whole group, but the failure should be visible.
            logger.exception("Skipping sentence that failed to synthesize: %r", sentence)
            continue
        if audio is None:
            # infer_simple yields nothing for text it does not handle.
            continue

        audio_fin.append(audio)
        audio_fin.append(np.zeros((silence_frames,), dtype=audio.dtype))

        duration = len(audio) / sampling_rate
        print(duration)
        end_time = start_time + duration + silenceTime
        ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|",":")))
        start_time = end_time

    # np.concatenate rejects an empty list, so fall back to an empty track.
    if audio_fin:
        combined = np.concatenate(audio_fin)
    else:
        combined = np.zeros((0,), dtype=np.float32)

    wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
    ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')

    write(wav_filename, sampling_rate, combined)

    with open(ass_filename, 'w', encoding='utf-8') as f:
        f.write(ass_header + '\n'.join(ass_entries))
    return (hps.data.sampling_rate, combined)
255
+
256
def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime,filepath,raw_text):
    """Turn an uploaded file or pasted text into grouped audio/subtitle files.

    The text is split into sentences, processed ``groupsize`` sentences at a
    time, and each group is written by ``generate_audio_and_srt_for_group``.

    Args:
        inputFile: Uploaded file object exposing ``.name``; when falsy,
            ``raw_text`` is used instead.
        groupsize: Number of sentences per output file (coerced to an
            integer of at least 1).
        speaker: Default speaker passed to the group generator.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded
            unchanged to the group generator.
        spealerList: Speaker alias table; an empty string is replaced by the
            ``"无"`` placeholder before being forwarded.
        silenceTime: Pause in seconds between sentences.
        filepath: Output directory, honoured only when CUDA is available;
            otherwise ``"books"`` is used.
        raw_text: Text used when no file was uploaded.

    Returns:
        The group generator's result for the first group when CUDA is not
        available, for the last group otherwise, or ``None`` when the input
        produced no sentences.
    """
    on_gpu = torch.cuda.is_available()
    # A blank path would make os.makedirs fail, so fall back to the default.
    directory_path = ((filepath or "").strip() or "books") if on_gpu else "books"

    # NOTE(review): this recursively deletes a directory whose path comes from
    # a UI text field. The UI label announces it, but confirm that exposing
    # this on a shared/public instance is intended.
    if os.path.exists(directory_path):
        shutil.rmtree(directory_path)

    os.makedirs(directory_path)
    if inputFile:
        text = extract_text_from_file(inputFile.name)
    else:
        text = raw_text
    sentences = extrac(extract_and_convert(text))

    # range() needs a positive integer step; UI sliders may deliver floats.
    GROUP_SIZE = max(1, int(groupsize))
    if spealerList == "":
        spealerList = "无"

    result = None  # stays None when there is nothing to synthesize
    for i in range(0, len(sentences), GROUP_SIZE):
        group = sentences[i:i+GROUP_SIZE]
        result = generate_audio_and_srt_for_group(group,directory_path, i//GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime)
        if not on_gpu:
            # Without CUDA only the first group is synthesized.
            return result
    return result
277
+
278
def infer_simple(
    text,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    sid,
    style_text=None,
    style_weight=0.7,
):
    """Synthesize ``text`` with speaker ``sid`` and return the samples.

    The language is chosen automatically: ``'JP'`` when ``is_japanese(text)``
    is true, ``'ZH'`` otherwise. Text that is neither Chinese nor Japanese,
    or that is at most one character long, is not synthesized and the
    function returns ``None``.

    ``style_text`` and ``style_weight`` are accepted but not used: features
    are always extracted with ``style_text=""`` and ``style_weight=0``.

    Returns:
        A float NumPy array taken from ``net_g.infer(...)[0][0, 0]``
        (presumably the mono waveform -- confirm against the model), or
        ``None`` when the text is skipped.
    """
    if not (is_chinese(text) or is_japanese(text)):
        return None
    if len(text) <= 1:
        return None

    language = 'JP' if is_japanese(text) else 'ZH'
    bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
        text,
        language,
        hps,
        device,
        style_text="",
        style_weight=0,
    )

    def _batched(tensor):
        # Move to the inference device and add a leading batch axis.
        return tensor.to(device).unsqueeze(0)

    with torch.no_grad():
        x_tst = _batched(phones)
        tones = _batched(tones)
        lang_ids = _batched(lang_ids)
        bert = _batched(bert)
        ja_bert = _batched(ja_bert)
        en_bert = _batched(en_bert)
        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
        del phones
        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
        audio = (
            net_g.infer(
                x_tst,
                x_tst_lengths,
                speakers,
                tones,
                lang_ids,
                bert,
                ja_bert,
                en_bert,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
            )[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )
        # Drop the input tensors before asking CUDA to release cached memory.
        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio
342
+
343
  if __name__ == "__main__":
344
  languages = [ "Auto", "ZH", "JP"]
345
  modelPaths = []
 
348
  modelPaths.append(os.path.join(dirpath, filename))
349
  hps = utils.get_hparams_from_file('Data/BangDream/configs/config.json')
350
  net_g = get_net_g(
351
+ model_path="Data/BangDream/models/G_1536000.pth", device=device, hps=hps
352
  )
353
  speaker_ids = hps.data.spk2id
354
  speakers = list(speaker_ids.keys())
 
359
  [好玩的](http://love.soyorin.top/)\n
360
  该界面的真实链接(国内可用): https://mahiruoshi-bangdream-bert-vits2.hf.space/\n
361
  API: https://mahiruoshi-bert-vits2-api.hf.space/ \n
362
+ 调用方式: https://mahiruoshi-bert-vits2-api.hf.space/?text={{speakText}}&speaker=chosen_speaker\n
363
  推荐搭配[Legado开源阅读](https://github.com/gedoor/legado)或[聊天bot](https://github.com/Paraworks/BangDreamAi)使用\n
364
  二创请标注作者:B站@Mahiroshi: https://space.bilibili.com/19874615\n
365
  训练数据集归属:BangDream及少歌手游,提取自BestDori,[数据集获取流程](https://nijigaku.top/2023/09/29/Bestbushiroad%E8%AE%A1%E5%88%92-vits-%E9%9F%B3%E9%A2%91%E6%8A%93%E5%8F%96%E5%8F%8A%E6%95%B0%E6%8D%AE%E9%9B%86%E5%AF%B9%E9%BD%90/)\n
 
380
  length_scale = gr.Slider(
381
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
382
  )
383
+ language = gr.Dropdown(
384
+ choices=languages, value="Auto", label="语言"
385
+ )
386
  with gr.Accordion(label="参数设定", open=True):
387
  sdp_ratio = gr.Slider(
388
  minimum=0, maximum=1, value=0.5, step=0.01, label="SDP/DP混合比"
 
407
  info="输入纯日语或者中文",
408
  value="我是来结束这个乐队的。",
409
  )
410
+ style_text = gr.Textbox(
411
+ label="情感辅助文本",
412
+ info="语言保持跟主文本一致,文本可以参考训练集:https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/filelists/Mygo.list)",
413
+ placeholder="使用辅助文本的语意来辅助生成对话(语言保持与主文本相同)\n\n"
414
+ "**注意**:不要使用**指令式文本**(如:开心),要使用**带有强烈情感的文本**(如:我好快乐!!!)"
415
+ )
416
  style_weight = gr.Slider(
417
  minimum=0,
418
  maximum=1,
 
424
  btn = gr.Button("点击生成", variant="primary")
425
  audio_output = gr.Audio(label="Output Audio")
426
  btntran = gr.Button("快速中翻日")
427
+ translateResult = gr.TextArea(label="使用百度翻译",placeholder="从这里复制翻译后的文本")
428
  btntran.click(translate, inputs=[text], outputs = [translateResult])
429
 
430
  btn.click(
 
438
  speaker,
439
  style_text,
440
  style_weight,
441
+ language,
442
  ],
443
  outputs=[audio_output],
444
  )
445
  with gr.TabItem('少歌在2.2版本'):
446
+ gr.Markdown(value="""
447
+ <div align="center">
448
+ <iframe style="width:100%;height:400px;" src="https://mahiruoshi-mygo-vits-bert.hf.space/" frameborder="0"></iframe>'
449
+ </div>"""
450
  )
451
+ with gr.Tab('拓展功能'):
452
+ with gr.Row():
453
+ with gr.Column():
454
+ gr.Markdown(
455
+ f"从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看自制galgame使用说明\n</a>"
456
+ )
457
+ inputFile = gr.UploadButton(label="txt文件输入")
458
+ raw_text = gr.TextArea(
459
+ label="文本输入",
460
+ info="输入纯日语或者中文",
461
+ value="つくし|我是来结束这个乐队的。",
462
+ )
463
+ groupSize = gr.Slider(
464
+ minimum=10, maximum=1000 if torch.cuda.is_available() else 50,value = 50, step=1, label="单个音频文件包含的最大字数"
465
+ )
466
+ silenceTime = gr.Slider(
467
+ minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"
468
+ )
469
+ filepath = gr.TextArea(
470
+ label="本地合成时的音频存储文件夹(会清空文件夹)",
471
+ value = "D:/audiobook/book1",
472
+ )
473
+ spealerList = gr.TextArea(
474
+ label="角色对应表,左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList}|{SeakerInUploadText}",
475
+ placeholder = "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
476
+ )
477
+ speaker = gr.Dropdown(
478
+ choices=speakers, value = "ましろ", label="选择默认说话人"
479
+ )
480
+ with gr.Column():
481
+ sdp_ratio = gr.Slider(
482
+ minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
483
+ )
484
+ noise_scale = gr.Slider(
485
+ minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
486
+ )
487
+ noise_scale_w = gr.Slider(
488
+ minimum=0.1, maximum=2, value=0.667, step=0.01, label="音素长度"
489
+ )
490
+ length_scale = gr.Slider(
491
+ minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
492
+ )
493
+ LastAudioOutput = gr.Audio(label="当使用cuda时才能在本地文件夹浏览全部文件")
494
+ btn2 = gr.Button("点击生成", variant="primary")
495
+ btn2.click(
496
+ audiobook,
497
+ inputs=[
498
+ inputFile,
499
+ groupSize,
500
+ speaker,
501
+ sdp_ratio,
502
+ noise_scale,
503
+ noise_scale_w,
504
+ length_scale,
505
+ spealerList,
506
+ silenceTime,
507
+ filepath,
508
+ raw_text
509
+ ],
510
+ outputs=[LastAudioOutput],
511
+ )
512
  print("推理页面已开启!")
513
  app.launch(share=True)