Mahiruoshi committed on
Commit dea6e28
1 Parent(s): 8eaa3cc

Update app.py

Files changed (1)
  1. app.py +81 -12
app.py CHANGED
@@ -44,6 +44,41 @@ import sys
  import re
  from tools.translate import translate

+ from fugashi import Tagger
+ import jaconv
+ import unidic
+ import subprocess
+
+ def download_unidic():
+     try:
+         Tagger()
+         print("Tagger launch successfully.")
+     except Exception as e:
+         print("UNIDIC dictionary not found, downloading...")
+         subprocess.run([sys.executable, "-m", "unidic", "download"])
+         print("Download completed.")
+
+
+ def kanji_to_hiragana(text):
+     tagger = Tagger()
+     output = ""
+
+     # 更新正则表达式以更准确地区分文本和标点符号
+     segments = re.findall(r'[一-龥ぁ-んァ-ン\w]+|[^\一-龥ぁ-んァ-ン\w\s]', text, re.UNICODE)
+
+     for segment in segments:
+         if re.match(r'[一-龥ぁ-んァ-ン\w]+', segment):
+             # 如果是单词或汉字,转换为平假名
+             for word in tagger(segment):
+                 kana = word.feature.kana or word.surface
+                 hiragana = jaconv.kata2hira(kana)  # 将片假名转换为平假名
+                 output += hiragana
+         else:
+             # 如果是标点符号,保持不变
+             output += segment
+
+     return output
+
  net_g = None

  device = (
@@ -134,7 +169,10 @@ def infer(
      style_text=None,
      style_weight=0.7,
      language = "Auto",
+     fugashi = True
  ):
+     if fugashi:
+         text = kanji_to_hiragana(text) if is_japanese(text) else text
      if language == "Auto":
          language= 'JP' if is_japanese(text) else 'ZH'
      bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
@@ -200,7 +238,7 @@ def loadmodel(model):
      _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
      return "success"

- def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
+ def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime,language_force,fugashi = True):
      audio_fin = []
      ass_entries = []
      start_time = 0
@@ -231,7 +269,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
              if FakeSpeaker == i.split("|")[1]:
                  speaker = i.split("|")[0]
          if sentence != '\n':
-             audio = infer_simple((remove_annotations(sentence.split("|")[-1]).replace(" ","")+"。").replace(",。","。").replace("。。","。"), sdp_ratio, noise_scale, noise_scale_w, length_scale,speaker)
+             text = (remove_annotations(sentence.split("|")[-1]).replace(" ","")+"。").replace(",。","。")
+             audio = infer_simple(
+                 text,
+                 sdp_ratio,
+                 noise_scale,
+                 noise_scale_w,
+                 length_scale,
+                 speaker,
+                 language_force,
+                 fugashi
+             )
              silence_frames = int(silenceTime * 44010) if is_chinese(sentence) else int(silenceTime * 44010)
              silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
              audio_fin.append(audio)
@@ -253,7 +301,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
          f.write(ass_header + '\n'.join(ass_entries))
      return (hps.data.sampling_rate, np.concatenate(audio_fin))

- def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime,filepath,raw_text):
+ def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime,filepath,raw_text,language_force,fugashi):
      directory_path = filepath if torch.cuda.is_available() else "books"

      if os.path.exists(directory_path):
@@ -264,13 +312,17 @@ def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale
          text = extract_text_from_file(inputFile.name)
      else:
          text = raw_text
-     sentences = extrac(extract_and_convert(text))
+     if language_force == 'None':
+         sentences = extrac(extract_and_convert(text))
+     else:
+         sentences = extrac(text)
+
      GROUP_SIZE = groupsize
      for i in range(0, len(sentences), GROUP_SIZE):
          group = sentences[i:i+GROUP_SIZE]
          if spealerList == "":
              spealerList = "无"
-         result = generate_audio_and_srt_for_group(group,directory_path, i//GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime)
+         result = generate_audio_and_srt_for_group(group,directory_path, i//GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime,language_force,fugashi)
          if not torch.cuda.is_available():
              return result
      return result
@@ -282,9 +334,17 @@ def infer_simple(
      noise_scale_w,
      length_scale,
      sid,
-     style_text=None,
-     style_weight=0.7,
+     language_force = "None",
+     fugashi = True
  ):
+
+     if language_force == "JP":
+         text = translate(text,"jp")
+     if language_force == "ZH":
+         text = translate(text,"zh")
+     if fugashi:
+         text = kanji_to_hiragana(text) if is_japanese(text) else text
+     print(text)
      if is_chinese(text) or is_japanese(text):
          if len(text) > 1:
              language= 'JP' if is_japanese(text) else 'ZH'
@@ -341,6 +401,7 @@ def infer_simple(
      return audio

  if __name__ == "__main__":
+     download_unidic()
      languages = [ "Auto", "ZH", "JP"]
      modelPaths = []
      for dirpath, dirnames, filenames in os.walk('Data/BangDream/models/'):
@@ -383,6 +444,7 @@ if __name__ == "__main__":
              language = gr.Dropdown(
                  choices=languages, value="Auto", label="语言"
              )
+             fugashi = gr.Checkbox(label="转化为片假名")
              with gr.Accordion(label="参数设定", open=True):
                  sdp_ratio = gr.Slider(
                      minimum=0, maximum=1, value=0.5, step=0.01, label="SDP/DP混合比"
@@ -439,6 +501,7 @@ if __name__ == "__main__":
                      style_text,
                      style_weight,
                      language,
+                     fugashi
                  ],
                  outputs=[audio_output],
              )
@@ -458,8 +521,12 @@ if __name__ == "__main__":
              raw_text = gr.TextArea(
                  label="文本输入",
                  info="输入纯日语或者中文",
-                 value="つくし|我是来结束这个乐队的。",
+                 value="筑紫|我是来结束这个乐队的。",
+             )
+             language_force = gr.Dropdown(
+                 choices=[ "None", "ZH", "JP"], value="None", label="将文本翻译为目标语言"
              )
+             fugashi = gr.Checkbox(label="转化为片假名")
              groupSize = gr.Slider(
                  minimum=10, maximum=1000 if torch.cuda.is_available() else 50,value = 50, step=1, label="单个音频文件包含的最大字数"
              )
@@ -472,14 +539,14 @@ if __name__ == "__main__":
              )
              spealerList = gr.TextArea(
                  label="角色对应表,左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList}|{SeakerInUploadText}",
-                 placeholder = "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
+                 value = "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
              )
              speaker = gr.Dropdown(
                  choices=speakers, value = "ましろ", label="选择默认说话人"
              )
              with gr.Column():
                  sdp_ratio = gr.Slider(
-                     minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
+                     minimum=0, maximum=1, value=0.5, step=0.01, label="SDP/DP混合比"
                  )
                  noise_scale = gr.Slider(
                      minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
@@ -505,9 +572,11 @@ if __name__ == "__main__":
                      spealerList,
                      silenceTime,
                      filepath,
-                     raw_text
+                     raw_text,
+                     language_force,
+                     fugashi
                  ],
                  outputs=[LastAudioOutput],
              )
      print("推理页面已开启!")
-     app.launch(share=True)
+     app.launch()
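
For context, a minimal standalone sketch (not part of the commit) of the kana-conversion path this change introduces, assuming fugashi, unidic, and jaconv are installed; the helper name to_hiragana and the sample sentence are illustrative only:

    # Sketch: convert Japanese text to hiragana the same way app.py's kanji_to_hiragana does.
    # Requires the UniDic dictionary (python -m unidic download), which download_unidic() handles at startup.
    from fugashi import Tagger
    import jaconv

    tagger = Tagger()

    def to_hiragana(text):  # illustrative helper, not from the commit
        output = ""
        for word in tagger(text):
            # UniDic provides each token's reading in katakana; fall back to the surface form.
            kana = word.feature.kana or word.surface
            output += jaconv.kata2hira(kana)
        return output

    print(to_hiragana("私は学生です"))  # expected: わたしはがくせいです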