Ailyth committed
Commit 2eccd3d
1 parent: e70c011

0220-152614-some_fix

Files changed (3)
  1. app.py +38 -31
  2. text/chinese.py +2 -2
  3. text/english.py +4 -4
app.py CHANGED
@@ -1,13 +1,3 @@
-import logging
-logging.getLogger("markdown_it").setLevel(logging.ERROR)
-logging.getLogger("urllib3").setLevel(logging.ERROR)
-logging.getLogger("httpcore").setLevel(logging.ERROR)
-logging.getLogger("httpx").setLevel(logging.ERROR)
-logging.getLogger("asyncio").setLevel(logging.ERROR)
-logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
-logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
-logging.getLogger("multipart").setLevel(logging.WARNING)
-
 import gradio as gr
 import numpy as np
 import soundfile as sf
@@ -26,6 +16,18 @@ from transformers.pipelines.audio_utils import ffmpeg_read
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 
+
+import logging
+logging.getLogger("markdown_it").setLevel(logging.ERROR)
+logging.getLogger("urllib3").setLevel(logging.ERROR)
+logging.getLogger("httpcore").setLevel(logging.ERROR)
+logging.getLogger("httpx").setLevel(logging.ERROR)
+logging.getLogger("asyncio").setLevel(logging.ERROR)
+logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
+logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
+logging.getLogger("multipart").setLevel(logging.WARNING)
+
+
 if "_CUDA_VISIBLE_DEVICES" in os.environ:
     os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
 tz = pytz.timezone('Asia/Singapore')
@@ -365,9 +367,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     startTime=timer()
     text=trim_text(text,text_language)
     change_sovits_weights(sovits_path)
-    tprint(f'👌LOADED SoVITS Model: {sovits_path}')
+    tprint(f'🏕️LOADED SoVITS Model: {sovits_path}')
     change_gpt_weights(gpt_path)
-    tprint(f'👌LOADED GPT Model: {gpt_path}')
+    tprint(f'🏕️LOADED GPT Model: {gpt_path}')
 
     prompt_language = dict_language[prompt_language]
     text_language = dict_language[text_language]
@@ -375,8 +377,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
     text = text.strip("\n")
     if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
-    print(("实际输入的参考文本:"), prompt_text)
-    print(("📝实际输入的目标文本:"), text)
+    #print(("实际输入的参考文本:"), prompt_text)
+    #print(("📝实际输入的目标文本:"), text)
     zero_wav = np.zeros(
         int(hps.data.sampling_rate * 0.3),
         dtype=np.float16 if is_half == True else np.float32,
@@ -418,7 +420,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         text = cut5(text)
     while "\n\n" in text:
         text = text.replace("\n\n", "\n")
-    print(("实际输入的目标文本(切句后):"), text)
+    print(f"🧨实际输入的目标文本(切句后):{text}\n")
     texts = text.split("\n")
     texts = merge_short_text_in_array(texts, 5)
     audio_opt = []
@@ -428,7 +430,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         if (len(text.strip()) == 0):
             continue
         if (text[-1] not in splits): text += "。" if text_language != "en" else "."
-        print(("实际输入的目标文本(每句):"), text)
+        print(("\n🎈实际输入的目标文本(每句):"), text)
         phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
         bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
         bert = torch.cat([bert1, bert2], 1)
@@ -561,13 +563,16 @@ def cut5(inp):
     # if not re.search(r'[^\w\s]', inp[-1]):
     #     inp += '。'
     inp = inp.strip("\n")
-    punds = r'[,.;?!、,。?!;:]'
+    punds = r'[,.;?!、,。?!;:…]'
    items = re.split(f'({punds})', inp)
-    items = ["".join(group) for group in zip(items[::2], items[1::2])]
-    opt = "\n".join(items)
+    mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
+    if len(items)%2 == 1:
+        mergeitems.append(items[-1])
+    opt = "\n".join(mergeitems)
     return opt
 
 
+
 def custom_sort_key(s):
     # 使用正则表达式提取字符串中的数字部分和非数字部分
     parts = re.split('(\d+)', s)
@@ -580,7 +585,7 @@ def tprint(text):
     print(f'UTC+8 - {now} - {text}')
 
 def wprint(text):
-    print(text)
+    tprint(text)
     gr.Warning(text)
 
 #裁切文本
@@ -589,11 +594,13 @@ def trim_text(text,language):
     limit_en = 60 #words
     search_limit_cj = limit_cj+30
     search_limit_en = limit_en +30
+    text = text.replace('\n', '').strip()
+
     if language =='English':
         words = text.split()
         if len(words) <= limit_en:
             return text
-        # 对英文文本进行处理
+        # English
        for i in range(limit_en, -1, -1):
            if any(punct in words[i] for punct in splits):
                return ' '.join(words[:i+1])
@@ -605,13 +612,13 @@ def trim_text(text,language):
     else:#中文日文
         if len(text) <= limit_cj:
             return text
-        for i in range(limit_cj, -1, -1): # 向前搜索
+        for i in range(limit_cj, -1, -1):
            if text[i] in splits:
                return text[:i+1]
-        for i in range(limit_cj, min(len(text), search_limit_cj)): # 向后搜索,但不超过search_limit
+        for i in range(limit_cj, min(len(text), search_limit_cj)):
            if text[i] in splits:
                return text[:i+1]
-        return text[:limit_cj] # 如果没有找到标点,或者超过搜索限制,直接裁切到limit
+        return text[:limit_cj]
 
 def duration(audio_file_path):
     try:
@@ -670,7 +677,7 @@ def transcribe(voice):
 
     time2=timer()
     tprint(f'transcribe COMPLETE,{round(time2-time1,4)}s')
-    tprint(f'\n 🔣Transcribed audio:\n 🔣Language:{language} \n 🔣Text:{text}' )
+    tprint(f'\n🔣转录结果:\n 🔣Language:{language} \n 🔣Text:{text}' )
     return text,language
 
 def clone_voice(user_voice,user_text,user_lang):
@@ -679,7 +686,7 @@ def clone_voice(user_voice,user_text,user_lang):
     if user_text == '':
         wprint("Please enter text to generate/请输入生成文字")
         return None
-    tprint('⚡Start clone')
+    #tprint('⚡Start clone')
     user_text=trim_text(user_text,user_lang)
     time1=timer()
     global gpt_path, sovits_path
@@ -736,9 +743,9 @@ with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
     chinese_models = [name for name, _ in models_by_language["中文"]]
     japanese_models = [name for name, _ in models_by_language["日本語"]]
     with gr.Row():
-        english_choice = gr.Radio(english_models, label="EN|English Model",value="Trump")
-        chinese_choice = gr.Radio(chinese_models, label="CN|中文模型")
-        japanese_choice = gr.Radio(japanese_models, label="JP|日本語モデル")
+        english_choice = gr.Radio(english_models, label="EN|English Model",value="Trump",scale=3)
+        chinese_choice = gr.Radio(chinese_models, label="CN|中文模型",scale=2)
+        japanese_choice = gr.Radio(japanese_models, label="JP|日本語モデル",scale=4)
 
     plsh='Text must match the selected language option to prevent errors, for example, if English is input but Chinese is selected for generation.\n文字一定要和语言选项匹配,不然要报错,比如输入的是英文,生成语言选中文'
     limit='Max 70 words. Excess will be ignored./单次最多处理120字左右,多余的会被忽略'
@@ -784,7 +791,7 @@ with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
         interactive=True,
         info='A suitable splitting method can achieve better generation results'
     )
-    volume = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.01, label='Volume')
+    volume = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.01, label='Volume/音量')
 
 
 
@@ -809,7 +816,7 @@ with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
         placeholder=plsh,info=limit)
 
     user_button = gr.Button("✨Clone Voice", variant="primary")
-    user_output = gr.Audio(label="💾Output wave file,Download it by clicking ⬇️")
+    user_output = gr.Audio(label="💾Download it by clicking ⬇️")
 
     gr.HTML('''<div align=center><img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.laobi.icu/badge?page_id=Ailyth/DLMP9" /></div>''')
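The main logic fix in app.py is in cut5(). Because re.split() with a capturing group returns an odd-length list whenever the input does not end in punctuation, the old zip()-based merge silently dropped the trailing fragment; the new code appends it, and the punctuation class now also matches "…". A minimal standalone sketch of the updated behaviour, assuming only the standard-library re module (the sample sentence is illustrative):

import re

def cut5(inp):
    # Split text into one clause per line, keeping each punctuation mark
    # attached to the clause that precedes it.
    inp = inp.strip("\n")
    punds = r'[,.;?!、,。?!;:…]'
    items = re.split(f'({punds})', inp)
    mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
    # An odd-length split means the text did not end with punctuation;
    # keep that trailing fragment instead of discarding it (the old bug).
    if len(items) % 2 == 1:
        mergeitems.append(items[-1])
    return "\n".join(mergeitems)

print(cut5("First clause, second clause. a trailing fragment with no final stop"))
# The previous version lost the final fragment; this version keeps it on its own line.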
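Separately, trim_text() now strips newlines (text = text.replace('\n', '').strip()) before measuring length, so multi-line input is trimmed by its real character or word count. The search strategy itself is unchanged: look backwards from the limit for a punctuation mark, then forwards up to limit+30, and hard-cut at the limit as a last resort. A toy sketch of the Chinese/Japanese branch, with a made-up splits set and a deliberately small limit (the real code uses limit_cj characters for Chinese/Japanese and limit_en = 60 words for English):

splits = {",", "。", "!", "?", ",", "."}  # illustrative subset

def trim_cj(text, limit=10, search_extra=5):
    # Mirrors the CJ branch of trim_text: backward search, bounded forward
    # search, then a hard cut at the limit.
    text = text.replace('\n', '').strip()
    if len(text) <= limit:
        return text
    for i in range(limit, -1, -1):
        if text[i] in splits:
            return text[:i + 1]
    for i in range(limit, min(len(text), limit + search_extra)):
        if text[i] in splits:
            return text[:i + 1]
    return text[:limit]

print(trim_cj("一二三四五,六七八九十一二三四五"))  # cut after the comma found by the backward search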
 
text/chinese.py CHANGED
@@ -30,7 +30,7 @@ rep_map = {
     "\n": ".",
     "·": ",",
     "、": ",",
-    "...": "…",
+    # "...": "…",
     "$": ".",
     "/": ",",
     "—": "-",
@@ -169,4 +169,4 @@ if __name__ == "__main__":
 
     # # 示例用法
     # text = "这是一个示例文本:,你好!这是一个测试..."
-    # print(g2p_paddle(text))  # 输出: 这是一个示例文本你好这是一个测试
+    # print(g2p_paddle(text))  # 输出: 这是一个示例文本你好这是一个测试
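The only change here comments out the "..." → "…" entry in rep_map, so a literal ASCII ellipsis is no longer collapsed into a single "…" during punctuation normalisation (note that cut5 in app.py gained "…" in its punctuation class in the same commit). A hedged sketch of the effect, using a hypothetical apply_rep_map helper and a reduced map; the actual replacement function in text/chinese.py is not part of this diff:

import re

def apply_rep_map(text, rep_map):
    # Hypothetical helper: substitute every rep_map key found in the text.
    pattern = re.compile("|".join(re.escape(k) for k in rep_map))
    return pattern.sub(lambda m: rep_map[m.group()], text)

old_map = {"、": ",", "...": "…", "$": "."}  # subset of rep_map before this commit
new_map = {"、": ",", "$": "."}              # "...": "…" is now commented out

sample = "测试...结束"
print(apply_rep_map(sample, old_map))  # 测试…结束
print(apply_rep_map(sample, new_map))  # 测试...结束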
text/english.py CHANGED
@@ -169,9 +169,9 @@ def read_dict_new():
                 line = line.strip()
                 word_split = line.split(" ")
                 word = word_split[0]
-                if word not in g2p_dict:
-                    g2p_dict[word] = []
-                    g2p_dict[word].append(word_split[1:])
+                #if word not in g2p_dict:
+                g2p_dict[word] = []
+                g2p_dict[word].append(word_split[1:])
 
             line_index = line_index + 1
             line = f.readline()
@@ -231,4 +231,4 @@ if __name__ == "__main__":
 
     # for group in syllables:
     #     for ph in group:
     #         all_phones.add(ph)
-    # print(all_phones)
+    # print(all_phones)
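Commenting out the if word not in g2p_dict: guard (and dedenting the two lines under it) changes how read_dict_new() handles duplicate dictionary entries: previously the first pronunciation seen for a word was kept and later lines were skipped; now the entry is reset on every line, so the last pronunciation wins. A toy illustration with made-up CMU-style lines (the real function reads them from a dictionary file):

lines = ["READ R IY1 D", "READ R EH1 D"]  # hypothetical duplicate entries

first_wins, last_wins = {}, {}
for line in lines:
    word_split = line.split(" ")
    word = word_split[0]
    if word not in first_wins:        # old behaviour: first occurrence wins
        first_wins[word] = []
        first_wins[word].append(word_split[1:])
    last_wins[word] = []              # new behaviour: entry reset on every line
    last_wins[word].append(word_split[1:])

print(first_wins["READ"])  # [['R', 'IY1', 'D']]
print(last_wins["READ"])   # [['R', 'EH1', 'D']]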