Mahiruoshi committed
Commit c25a639 · 1 Parent(s): d289ff1

Update app.py

Files changed (1): app.py +56 -26
app.py CHANGED
@@ -69,6 +69,9 @@ def extrac(text):
     text = re.sub("<[^>]*>","",text)
     result_list = re.split(r'\n', text)
     final_list = []
+    if not torch.cuda.is_available():
+        if len(final_list) > 10:
+            return ['对不起,做不到']
     for i in result_list:
         if is_english(i):
             i = romajitable.to_kana(i).katakana
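As committed, the new CPU guard tests `final_list` immediately after it is initialized to `[]`, so the early return can never fire. If the intent is to cap how much text a CPU-only host will synthesize, the length check has to run against the populated list. A minimal sketch of that variant, reusing the 10-line cap and reply string from the diff (the name `extrac_guarded` is hypothetical):

```python
import re
import torch

def extrac_guarded(text):
    # Strip HTML tags and split into lines, as the committed extrac() does.
    text = re.sub("<[^>]*>", "", text)
    result_list = re.split(r"\n", text)
    # Hypothetical fix: check the list that actually holds the lines,
    # rather than the still-empty final_list.
    if not torch.cuda.is_available() and len(result_list) > 10:
        return ["对不起,做不到"]  # same "sorry, can't do that" reply as the diff
    return result_list
```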
@@ -153,15 +156,21 @@ def get_text(text,hps_ms):
     text_norm = torch.LongTensor(text_norm)
     return text_norm
 
-def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
+def vc_fn(text,language,n_scale,n_scale_w,l_scale,original_speaker, target_speaker, record_audio, upload_audio):
     input_audio = record_audio if record_audio is not None else upload_audio
-    if input_audio is None:
-        return "You need to record or upload an audio", None
-    sampling_rate, audio = input_audio
     original_speaker_id = selection(original_speaker)
     target_speaker_id = selection(target_speaker)
-
-    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+    if input_audio is None:
+        stn_tst = get_text(sle(language,text),hps)
+        with torch.no_grad():
+            x_tst = stn_tst.unsqueeze(0).to(dev)
+            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+            sid = torch.LongTensor([original_speaker_id]).to(dev)
+            audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+        sampling_rate = hps.data.sampling_rate
+    else:
+        sampling_rate, audio = input_audio
+        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
     if len(audio.shape) > 1:
         audio = librosa.to_mono(audio.transpose(1, 0))
     if sampling_rate != hps.data.sampling_rate:
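A note on the new branching in `vc_fn`: the TTS path yields float32 samples already in [-1, 1] at `hps.data.sampling_rate`, whereas Gradio's microphone/upload components return integer PCM plus its own sample rate, which is why the `np.iinfo` normalization moved inside the `else` branch. A self-contained sketch of that pre-processing contract (the `prepare_audio` name and the explicit resample call are illustrative, not from the diff):

```python
import numpy as np
import librosa

def prepare_audio(sampling_rate: int, audio: np.ndarray, target_sr: int) -> np.ndarray:
    # Uploaded/recorded audio arrives as integer PCM: rescale to float32 in [-1, 1].
    # TTS output skips this because net_g.infer() already emits float32 in that range.
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    # Gradio returns (samples, channels); librosa.to_mono() expects (channels, samples).
    if audio.ndim > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    # Match the model's expected sampling rate before inference.
    if sampling_rate != target_sr:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=target_sr)
    return audio
```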
@@ -334,16 +343,26 @@ def selection(speaker):
     else:
         return 0
 
+def check_text(input):
+    if isinstance(input, str):
+        return input
+    else:
+        with open(input.name, "r", encoding="utf-8") as f:
+            return f.read()
+
 def create_tts_fn(net_g,hps,speaker_id):
     speaker_id = int(speaker_id)
     def tts_fn(is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+        text = check_text(text)
         repeat_ime = int(repeat_time)
         if is_gpt:
             openai.api_key = api_key
             text,messages = chatgpt(text)
             htm = to_html(messages)
         else:
-            htm = ''
+            messages = []
+            messages.append({"role": "assistant", "content": text})
+            htm = to_html(messages)
         if not extract:
             t1 = time.time()
             stn_tst = get_text(sle(language,text),hps)
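The new `check_text()` is what lets one `tts_fn` serve both a TextArea (plain `str`) and a `gr.File` upload, since Gradio file objects expose their temp-file path as `.name`. A quick sketch of both call patterns (`UploadedFile` is a hypothetical stand-in for Gradio's wrapper):

```python
import tempfile

def check_text(input):
    # Strings pass through; file-like objects are read from their .name path.
    if isinstance(input, str):
        return input
    with open(input.name, "r", encoding="utf-8") as f:
        return f.read()

class UploadedFile:
    """Stand-in for the object gr.File hands to an event handler."""
    def __init__(self, name: str):
        self.name = name  # path to the uploaded temp file

assert check_text("こんにちは") == "こんにちは"

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as tmp:
    tmp.write("chapter one...")
assert check_text(UploadedFile(tmp.name)) == "chapter one..."
```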
@@ -417,7 +436,7 @@ if __name__ == '__main__':
     hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
     dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     models = []
-    schools = ["Nijigasaki","ShojoKageki","ShojoKageki-Nijigasaki"]
+    schools = ["ShojoKageki-Nijigasaki","ShojoKageki","Nijigasaki"]
     lan = ["中文","日文","自动","手动"]
     with open("checkpoints/info.json", "r", encoding="utf-8") as f:
         models_info = json.load(f)
@@ -444,12 +463,13 @@ if __name__ == '__main__':
             name = speakers[j]["name"]
             content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
         models.append(content)
-    idols = ["c1","c2","高咲侑","歩夢","かすみ","しずく","果林","愛","彼方","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","華恋","まひる","なな","クロディーヌ","ひかり",'純那',"香子","真矢","双葉","ミチル","メイファン","やちよ","晶","いちえ","ゆゆ子","塁","珠緒","あるる","ララフィン","美空","静羽","あるる"]
     with gr.Blocks() as app:
         with gr.Tabs():
             for i in schools:
                 with gr.TabItem(i):
+                    idols = ["派蒙"]
                     for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
+                        idols.append(name)
                         with gr.TabItem(name):
                             with gr.Column():
                                 with gr.Row():
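Because `idols` is now re-seeded inside each school tab, any component built after the loop, such as the Voice Conversion dropdowns below, sees only "派蒙" plus the last school's speaker names. A toy sketch of that scoping, with hypothetical speaker lists standing in for `models`:

```python
schools = ["ShojoKageki-Nijigasaki", "ShojoKageki", "Nijigasaki"]
speaker_names = {  # hypothetical stand-ins for the names in models[...]
    "ShojoKageki-Nijigasaki": ["華恋", "歩夢"],
    "ShojoKageki": ["華恋", "まひる"],
    "Nijigasaki": ["歩夢", "かすみ", "しずく"],
}
for school in schools:
    idols = ["派蒙"]            # re-seeded on every tab, as in the diff
    for name in speaker_names[school]:
        idols.append(name)
# By the time the VC tab is built (after the loop), only the last school remains:
print(idols)      # ['派蒙', '歩夢', 'かすみ', 'しずく']
print(idols[-2])  # 'かすみ' - the source_speaker default below
print(idols[-3])  # '歩夢'   - the target_speaker default below
```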
@@ -471,35 +491,45 @@ if __name__ == '__main__':
                                     input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                     input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                     input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
-                                with gr.Accordion(label="Advanced Setting(GPT3.5接口+长句子合成,建议克隆本仓库后运行main.py)", open=False):
+                                with gr.Accordion(label="Advanced Setting(GPT3.5接口+小说合成,建议克隆本仓库后运行main.py)", open=False):
                                     input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
+                                    inputxt = gr.File(label="Text")
+                                    btnbook = gr.Button("小说合成")
                                     output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
                                     api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
                                     api_input2 = gr.TextArea(label="api-key",lines=1,value = '见 https://openai.com/blog/openai-api')
                                     audio_input1 = gr.Checkbox(value=False, label="修改音频路径(live2d)")
                                     audio_input2 = gr.TextArea(label="音频路径",lines=1,value = '#参考 D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
                                     audio_input3 = gr.Dropdown(label="重复生成次数", choices=list(range(101)), value='0', interactive=True)
+                                    btnbook.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
                                     btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
-            with gr.Tab("Voice Conversion(弱化版sovits)"):
-                gr.Markdown("""
-                录制或上传声音,并选择要转换的音色。
-                """)
-                with gr.Column():
-                    record_audio = gr.Audio(label="record your voice", source="microphone")
-                    upload_audio = gr.Audio(label="or upload audio here", source="upload")
-                    source_speaker = gr.Dropdown(choices=idols, value="歩夢", label="source speaker")
-                    target_speaker = gr.Dropdown(choices=idols, value="まひる", label="target speaker")
-                with gr.Column():
-                    message_box = gr.Textbox(label="Message")
-                    converted_audio = gr.Audio(label='converted audio')
-                btn = gr.Button("Convert!")
-                btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
-                          outputs=[message_box, converted_audio])
+            with gr.Tab("Voice Conversion(就是sovits的原理)"):
+                gr.Markdown("""
+                声线转化,使用模型中的说话人作为音源时效果更佳
+                """)
+                with gr.Column():
+                    with gr.Accordion(label="方法1:录制或上传声音,可进行歌声合成", open=False):
+                        record_audio = gr.Audio(label="record your voice", source="microphone")
+                        upload_audio = gr.Audio(label="or upload audio here", source="upload")
+                    with gr.Accordion(label="方法2:由原说话人先进行tts后套娃,适用于合成中文等特殊场景", open=True):
+                        text = gr.TextArea(label="Text", value='由源说话人进行语音转化',lines = 1)
+                        language = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
+                        n_scale = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
+                        n_scale_w = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
+                        l_scale = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1.1)
+                        source_speaker = gr.Dropdown(choices=idols, value=idols[-2], label="source speaker")
+                        target_speaker = gr.Dropdown(choices=idols, value=idols[-3], label="target speaker")
+                with gr.Column():
+                    message_box = gr.Textbox(label="Message")
+                    converted_audio = gr.Audio(label='converted audio')
+                btn = gr.Button("Convert!")
+                btn.click(vc_fn, inputs=[text,language,n_scale,n_scale_w,l_scale,source_speaker, target_speaker, record_audio, upload_audio],
+                          outputs=[message_box, converted_audio])
             with gr.Tab("说明"):
                 gr.Markdown(
                     "### <center> 请不要生成会对个人以及企划造成侵害的内容,自觉遵守相关法律,禁止商业使用或让他人产生困扰\n"
                     "<div align='center'>从左到右分别是虹团,少歌中文特化版,以及五校混合版。这三个均为不同的模型,效果也有差异</div>\n"
                     "<div align='center'>因为我会时不时地更新模型,所以会碰到平台抽风问题,大部分情况下一天就能恢复了。</div>\n"
-                    '<div align="center"><a>参数说明:这个十分玄学,我还没找到最合适的,如果效果不佳可以将噪声比例和噪声偏差调节至0。按照经验,合成日语时也可以将噪声比例调节至0.2-0.3区间,语调会正常一些。duration代表整体语速,1.0大部分情况应该就够了</div>'
+                    '<div align="center"><a>参数说明:这个十分玄学,我还没找到最合适的,如果效果不佳可以将噪声比例和噪声偏差调节至0,这会完全随机化音频源。按照经验,合成日语时也可以将噪声比例调节至0.2-0.3区间,语调会正常一些。duration代表整体语速,可视情况调至1.1或1.2</div>'
                     '<div align="center"><a>建议只在平台上体验最基础的功能,强烈建议将该仓库克隆至本地或者于colab运行 main.py或app.py</div>')
     app.launch()
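The two click bindings above reuse the same closure: `btnVC` feeds `input1` (the TextArea, a `str`) into the `text` slot, while `btnbook` feeds `inputxt` (a `gr.File`) into that same positional slot, relying on `check_text()` to normalize both. A stripped-down sketch of the pattern (component names and the `synth` stub are illustrative):

```python
import gradio as gr

def check_text(input):
    if isinstance(input, str):
        return input
    with open(input.name, "r", encoding="utf-8") as f:
        return f.read()

def synth(text):
    # Placeholder for tts_fn: echo what the model would receive.
    return "would synthesize: " + check_text(text)[:50]

with gr.Blocks() as demo:
    typed = gr.TextArea(label="Text")
    uploaded = gr.File(label="Text file")
    out = gr.Textbox(label="Result")
    # Two triggers, one handler: the File lands in the same positional
    # slot as the TextArea, just like btnbook vs. btnVC in the diff.
    gr.Button("Synthesize typed").click(synth, inputs=[typed], outputs=[out])
    gr.Button("Synthesize file").click(synth, inputs=[uploaded], outputs=[out])

# demo.launch()
```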
 