XzJosh committed on
Commit
6ae47ab
·
verified ·
1 Parent(s): bf0dde6

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -120
app.py CHANGED
@@ -9,25 +9,10 @@ logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
9
  logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
10
  import pdb
11
 
12
- if os.path.exists("./gweight.txt"):
13
- with open("./gweight.txt", 'r',encoding="utf-8") as file:
14
- gweight_data = file.read()
15
- gpt_path = os.environ.get(
16
- "gpt_path", gweight_data)
17
- else:
18
- gpt_path = os.environ.get(
19
- "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
20
-
21
- if os.path.exists("./sweight.txt"):
22
- with open("./sweight.txt", 'r',encoding="utf-8") as file:
23
- sweight_data = file.read()
24
- sovits_path = os.environ.get("sovits_path", sweight_data)
25
- else:
26
- sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
27
- # gpt_path = os.environ.get(
28
- # "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
29
- # )
30
- # sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth")
31
  cnhubert_base_path = os.environ.get(
32
  "cnhubert_base_path", "pretrained_models/chinese-hubert-base"
33
  )
@@ -36,8 +21,6 @@ bert_path = os.environ.get(
36
  )
37
  infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
38
  infer_ttswebui = int(infer_ttswebui)
39
- is_share = os.environ.get("is_share", "False")
40
- is_share=eval(is_share)
41
  if "_CUDA_VISIBLE_DEVICES" in os.environ:
42
  os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
43
  is_half = eval(os.environ.get("is_half", "True"))
@@ -47,6 +30,10 @@ import numpy as np
47
  import librosa,torch
48
  from feature_extractor import cnhubert
49
  cnhubert.cnhubert_base_path=cnhubert_base_path
 
 
 
 
50
 
51
  from module.models import SynthesizerTrn
52
  from AR.models.t2s_lightning_module import Text2SemanticLightningModule
@@ -55,17 +42,12 @@ from text.cleaner import clean_text
55
  from time import time as ttime
56
  from module.mel_processing import spectrogram_torch
57
  from my_utils import load_audio
58
- from tools.i18n.i18n import I18nAuto
59
- i18n = I18nAuto()
60
 
61
- os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。
62
 
63
- if torch.cuda.is_available():
64
- device = "cuda"
65
- elif torch.backends.mps.is_available():
66
- device = "mps"
67
- else:
68
- device = "cpu"
69
 
70
  tokenizer = AutoTokenizer.from_pretrained(bert_path)
71
  bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
@@ -74,11 +56,12 @@ if is_half == True:
74
  else:
75
  bert_model = bert_model.to(device)
76
 
 
77
  def get_bert_feature(text, word2ph):
78
  with torch.no_grad():
79
  inputs = tokenizer(text, return_tensors="pt")
80
  for i in inputs:
81
- inputs[i] = inputs[i].to(device)
82
  res = bert_model(**inputs, output_hidden_states=True)
83
  res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
84
  assert len(word2ph) == len(text)
@@ -116,7 +99,6 @@ class DictToAttrRecursive(dict):
116
  except KeyError:
117
  raise AttributeError(f"Attribute {item} not found")
118
 
119
-
120
  ssl_model = cnhubert.get_model()
121
  if is_half == True:
122
  ssl_model = ssl_model.half().to(device)
@@ -143,7 +125,6 @@ def change_sovits_weights(sovits_path):
143
  vq_model = vq_model.to(device)
144
  vq_model.eval()
145
  print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
146
- with open("./sweight.txt","w",encoding="utf-8")as f:f.write(sovits_path)
147
  change_sovits_weights(sovits_path)
148
 
149
  def change_gpt_weights(gpt_path):
@@ -160,9 +141,9 @@ def change_gpt_weights(gpt_path):
160
  t2s_model.eval()
161
  total = sum([param.nelement() for param in t2s_model.parameters()])
162
  print("Number of parameter: %.2fM" % (total / 1e6))
163
- with open("./gweight.txt","w",encoding="utf-8")as f:f.write(gpt_path)
164
  change_gpt_weights(gpt_path)
165
 
 
166
  def get_spepc(hps, filename):
167
  audio = load_audio(filename, int(hps.data.sampling_rate))
168
  audio = torch.FloatTensor(audio)
@@ -211,8 +192,6 @@ def clean_text_inf(text, language):
211
  phones = cleaned_text_to_sequence(phones)
212
 
213
  return phones, word2ph, norm_text
214
-
215
-
216
  def get_bert_inf(phones, word2ph, norm_text, language):
217
  if language == "zh":
218
  bert = get_bert_feature(norm_text, word2ph).to(device)
@@ -292,7 +271,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
292
  t1 = ttime()
293
  prompt_language = dict_language[prompt_language]
294
  text_language = dict_language[text_language]
295
-
296
  if prompt_language == "en":
297
  phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language)
298
  else:
@@ -309,7 +288,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
309
  bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language)
310
  else:
311
  bert1 = nonen_get_bert_inf(prompt_text, prompt_language)
312
-
313
  for text in texts:
314
  # 解决输入目标文本的空行导致报错的问题
315
  if (len(text.strip()) == 0):
@@ -323,7 +302,6 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
323
  bert2 = get_bert_inf(phones2, word2ph2, norm_text2, text_language)
324
  else:
325
  bert2 = nonen_get_bert_inf(text, text_language)
326
-
327
  bert = torch.cat([bert1, bert2], 1)
328
 
329
  all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
@@ -446,96 +424,86 @@ def cut2(inp):
446
  def cut3(inp):
447
  inp = inp.strip("\n")
448
  return "\n".join(["%s。" % item for item in inp.strip("。").split("。")])
449
- def cut4(inp):
450
- inp = inp.strip("\n")
451
- return "\n".join(["%s." % item for item in inp.strip(".").split(".")])
452
-
453
- def custom_sort_key(s):
454
- # 使用正则表达式提取字符串中的数字部分和非数字部分
455
- parts = re.split('(\d+)', s)
456
- # 将数字部分转换为整数,非数字部分保持不变
457
- parts = [int(part) if part.isdigit() else part for part in parts]
458
- return parts
459
-
460
- def change_choices():
461
- SoVITS_names, GPT_names = get_weights_names()
462
- return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
463
-
464
- pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
465
- pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
466
- SoVITS_weight_root="SoVITS_weights"
467
- GPT_weight_root="GPT_weights"
468
- os.makedirs(SoVITS_weight_root,exist_ok=True)
469
- os.makedirs(GPT_weight_root,exist_ok=True)
470
- def get_weights_names():
471
- SoVITS_names = [pretrained_sovits_name]
472
- for name in os.listdir(SoVITS_weight_root):
473
- if name.endswith(".pth"):SoVITS_names.append("%s/%s"%(SoVITS_weight_root,name))
474
- GPT_names = [pretrained_gpt_name]
475
- for name in os.listdir(GPT_weight_root):
476
- if name.endswith(".ckpt"): GPT_names.append("%s/%s"%(GPT_weight_root,name))
477
- return SoVITS_names,GPT_names
478
- SoVITS_names,GPT_names = get_weights_names()
479
 
480
  with gr.Blocks(title="GPT-SoVITS WebUI") as app:
481
- gr.Markdown(
482
- value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
483
- )
 
 
 
 
 
 
 
 
 
 
484
  with gr.Group():
485
- gr.Markdown(value=i18n("模型切换"))
486
- with gr.Row():
487
- GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path,interactive=True)
488
- SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path,interactive=True)
489
- refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
490
- refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
491
- SoVITS_dropdown.change(change_sovits_weights,[SoVITS_dropdown],[])
492
- GPT_dropdown.change(change_gpt_weights,[GPT_dropdown],[])
493
- gr.Markdown(value=i18n("*请上传并填写参考信息"))
494
  with gr.Row():
495
- inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath")
496
- prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
497
- prompt_language = gr.Dropdown(
498
- label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文")
499
- )
500
- gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。"))
 
 
 
 
 
 
 
 
 
 
 
501
  with gr.Row():
502
- text = gr.Textbox(label=i18n("需要合成的文本"), value="")
503
  text_language = gr.Dropdown(
504
- label=i18n("需要合成的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文")
505
- )
506
- how_to_cut = gr.Radio(
507
- label=i18n("怎么切"),
508
- choices=[i18n("不切"),i18n("凑五句一切"),i18n("凑50字一切"),i18n("按中文句号。切"),i18n("按英文句号.切"),],
509
- value=i18n("凑50字一切"),
510
- interactive=True,
511
  )
512
- inference_button = gr.Button(i18n("合成语音"), variant="primary")
513
- output = gr.Audio(label=i18n("输出的语音"))
514
-
515
  inference_button.click(
516
  get_tts_wav,
517
- [inp_ref, prompt_text, prompt_language, text, text_language,how_to_cut],
518
  [output],
519
  )
520
 
521
- gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
522
- with gr.Row():
523
- text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"),value="")
524
- button1 = gr.Button(i18n("凑五句一切"), variant="primary")
525
- button2 = gr.Button(i18n("凑50字一切"), variant="primary")
526
- button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
527
- button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
528
- text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
529
- button1.click(cut1, [text_inp], [text_opt])
530
- button2.click(cut2, [text_inp], [text_opt])
531
- button3.click(cut3, [text_inp], [text_opt])
532
- button4.click(cut4, [text_inp], [text_opt])
533
- gr.Markdown(value=i18n("后续将支持混合语种编码文本输入。"))
534
-
535
- app.queue(concurrency_count=511, max_size=1022).launch(
536
- server_name="0.0.0.0",
537
- inbrowser=True,
538
- share=is_share,
539
- server_port=infer_ttswebui,
540
- quiet=True,
541
- )
 
9
  logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
10
  import pdb
11
 
12
+ gpt_path = os.environ.get(
13
+ "gpt_path", "models/Taffy/Taffy-e5.ckpt"
14
+ )
15
+ sovits_path = os.environ.get("sovits_path", "models/Taffy/Taffy_e20_s1020.pth")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  cnhubert_base_path = os.environ.get(
17
  "cnhubert_base_path", "pretrained_models/chinese-hubert-base"
18
  )
 
21
  )
22
  infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
23
  infer_ttswebui = int(infer_ttswebui)
 
 
24
  if "_CUDA_VISIBLE_DEVICES" in os.environ:
25
  os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
26
  is_half = eval(os.environ.get("is_half", "True"))
 
30
  import librosa,torch
31
  from feature_extractor import cnhubert
32
  cnhubert.cnhubert_base_path=cnhubert_base_path
33
+ import ssl
34
+ ssl._create_default_https_context = ssl._create_unverified_context
35
+ import nltk
36
+ nltk.download('cmudict')
37
 
38
  from module.models import SynthesizerTrn
39
  from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 
42
  from time import time as ttime
43
  from module.mel_processing import spectrogram_torch
44
  from my_utils import load_audio
 
 
45
 
46
+ device = "cuda" if torch.cuda.is_available() else "cpu"
47
 
48
+ is_half = eval(
49
+ os.environ.get("is_half", "True" if torch.cuda.is_available() else "False")
50
+ )
 
 
 
51
 
52
  tokenizer = AutoTokenizer.from_pretrained(bert_path)
53
  bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
 
56
  else:
57
  bert_model = bert_model.to(device)
58
 
59
+
60
  def get_bert_feature(text, word2ph):
61
  with torch.no_grad():
62
  inputs = tokenizer(text, return_tensors="pt")
63
  for i in inputs:
64
+ inputs[i] = inputs[i].to(device) #####输入是long不用管精度问题,精度随bert_model
65
  res = bert_model(**inputs, output_hidden_states=True)
66
  res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
67
  assert len(word2ph) == len(text)
 
99
  except KeyError:
100
  raise AttributeError(f"Attribute {item} not found")
101
 
 
102
  ssl_model = cnhubert.get_model()
103
  if is_half == True:
104
  ssl_model = ssl_model.half().to(device)
 
125
  vq_model = vq_model.to(device)
126
  vq_model.eval()
127
  print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
 
128
  change_sovits_weights(sovits_path)
129
 
130
  def change_gpt_weights(gpt_path):
 
141
  t2s_model.eval()
142
  total = sum([param.nelement() for param in t2s_model.parameters()])
143
  print("Number of parameter: %.2fM" % (total / 1e6))
 
144
  change_gpt_weights(gpt_path)
145
 
146
+
147
  def get_spepc(hps, filename):
148
  audio = load_audio(filename, int(hps.data.sampling_rate))
149
  audio = torch.FloatTensor(audio)
 
192
  phones = cleaned_text_to_sequence(phones)
193
 
194
  return phones, word2ph, norm_text
 
 
195
  def get_bert_inf(phones, word2ph, norm_text, language):
196
  if language == "zh":
197
  bert = get_bert_feature(norm_text, word2ph).to(device)
 
271
  t1 = ttime()
272
  prompt_language = dict_language[prompt_language]
273
  text_language = dict_language[text_language]
274
+
275
  if prompt_language == "en":
276
  phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language)
277
  else:
 
288
  bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language)
289
  else:
290
  bert1 = nonen_get_bert_inf(prompt_text, prompt_language)
291
+
292
  for text in texts:
293
  # 解决输入目标文本的空行导致报错的问题
294
  if (len(text.strip()) == 0):
 
302
  bert2 = get_bert_inf(phones2, word2ph2, norm_text2, text_language)
303
  else:
304
  bert2 = nonen_get_bert_inf(text, text_language)
 
305
  bert = torch.cat([bert1, bert2], 1)
306
 
307
  all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
 
424
  def cut3(inp):
425
  inp = inp.strip("\n")
426
  return "\n".join(["%s。" % item for item in inp.strip("。").split("。")])
427
+
428
def scan_audio_files(folder_path):
    """Return the names of the WAV files found directly inside a folder.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        A sorted list of bare file names (no directory component) whose
        extension is ``.wav``, matched case-insensitively.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            read (for example, when it does not exist).
    """
    # os.listdir() yields entries in arbitrary, platform-dependent order, so
    # sort to keep the result stable between runs.
    return sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.wav')
    )
431
+
432
def load_audio_text_mappings(folder_path, list_file_name):
    """Build two-way lookups between transcripts and audio file paths.

    The list file is read as UTF-8, one record per line, with fields
    separated by ``|``. Field 0 is taken as the audio file name and field 3
    as its transcript; lines with fewer than four fields are ignored.

    Args:
        folder_path: Directory holding the list file; audio file names are
            joined onto this directory.
        list_file_name: Name of the list file inside ``folder_path``.

    Returns:
        A ``(text_to_audio, audio_to_text)`` tuple of dicts. When the same
        transcript or path occurs more than once, the last record wins.
    """
    text_to_audio_mappings = {}
    audio_to_text_mappings = {}
    list_path = os.path.join(folder_path, list_file_name)
    with open(list_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('|')
            if len(fields) < 4:
                # Not enough fields to carry both a file name and a transcript.
                continue
            wav_path = os.path.join(folder_path, fields[0])
            transcript = fields[3]
            text_to_audio_mappings[transcript] = wav_path
            audio_to_text_mappings[wav_path] = transcript
    return text_to_audio_mappings, audio_to_text_mappings
445
+
446
+ audio_folder_path = 'audio/Taffy'
447
+ text_to_audio_mappings, audio_to_text_mappings = load_audio_text_mappings(audio_folder_path, 'Taffy.list')
 
 
 
 
 
 
 
 
 
448
 
449
  with gr.Blocks(title="GPT-SoVITS WebUI") as app:
450
+ gr.Markdown(value="""
451
+ # <center>【AI塔菲】在线语音生成(GPT-SoVITS)\n
452
+
453
+ ### <center>模型作者:Xz乔希 https://space.bilibili.com/5859321\n
454
+ ### <center>GPT-SoVITS在线合集:https://www.modelscope.cn/studios/xzjosh/GPT-SoVITS\n
455
+ ### <center>数据集下载:https://huggingface.co/datasets/XzJosh/audiodataset\n
456
+ ### <center>声音归属:永雏塔菲 https://space.bilibili.com/1265680561\n
457
+ ### <center>GPT-SoVITS项目:https://github.com/RVC-Boss/GPT-SoVITS\n
458
+ ### <center>使用本模型请严格遵守法律法规!发布二创作品请标注本项目作者及链接、作品使用GPT-SoVITS AI生成!\n
459
+ ### <center>⚠️在线端不稳定且生成速度较慢,强烈建议下载模型本地推理!\n
460
+ """)
461
+ # with gr.Tabs():
462
+ # with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
463
  with gr.Group():
464
+ gr.Markdown(value="*参考音频选择(必选)")
 
 
 
 
 
 
 
 
465
  with gr.Row():
466
+ audio_select = gr.Dropdown(label="选择参考音频(不建议选较长的)", choices=list(text_to_audio_mappings.keys()))
467
+ ref_audio = gr.Audio(label="参考音频试听")
468
+ ref_text = gr.Textbox(label="参考音频文本")
469
+
470
# Callback that refreshes the reference text box and the audio preview.
def update_ref_text_and_audio(selected_text):
    """Return the selected transcript and the audio path mapped to it.

    The path is looked up in the module-level ``text_to_audio_mappings``;
    an empty string is returned as the path when the transcript is unknown.
    """
    return selected_text, text_to_audio_mappings.get(selected_text, "")
474
+
475
+ # 绑定下拉菜单的变化到更新函数
476
+ audio_select.change(update_ref_text_and_audio, [audio_select], [ref_text, ref_audio])
477
+
478
+ # 其他 Gradio 组件和功能
479
+ prompt_language = gr.Dropdown(
480
+ label="参考音频语种", choices=["中文", "英文", "日文"], value="中文"
481
+ )
482
+ gr.Markdown(value="*请填写需要合成的目标文本")
483
  with gr.Row():
484
+ text = gr.Textbox(label="需要合成的文本", value="")
485
  text_language = gr.Dropdown(
486
+ label="需要合成的语种", choices=["中文", "英文", "日文"], value="中文"
 
 
 
 
 
 
487
  )
488
+ inference_button = gr.Button("合成语音", variant="primary")
489
+ output = gr.Audio(label="输出的语音")
 
490
  inference_button.click(
491
  get_tts_wav,
492
+ [audio_select, ref_text, prompt_language, text, text_language],
493
  [output],
494
  )
495
 
496
+
497
+ gr.Markdown(value="文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")
498
+ with gr.Row():
499
+ text_inp = gr.Textbox(label="需要合成的切分前文本", value="")
500
+ button1 = gr.Button("凑五句一切", variant="primary")
501
+ button2 = gr.Button("凑50字一切", variant="primary")
502
+ button3 = gr.Button("按中文句号。切", variant="primary")
503
+ text_opt = gr.Textbox(label="切分后文本", value="")
504
+ button1.click(cut1, [text_inp], [text_opt])
505
+ button2.click(cut2, [text_inp], [text_opt])
506
+ button3.click(cut3, [text_inp], [text_opt])
507
+
508
+ app.queue(max_size=10)
509
+ app.launch(inbrowser=True)