Mahiruoshi committed
Commit 6c297e8
1 Parent(s): c3497ff

Update main.py

Files changed (1): main.py +63 -42
main.py CHANGED
@@ -45,9 +45,8 @@ def extrac(text):
         i = romajitable.to_kana(i).katakana
         i = i.replace('\n','').replace(' ','')
         #Current length of single sentence: 20
-        '''
         if len(i)>1:
-            if len(i) > 20:
+            if len(i) > 50:
                 try:
                     cur_list = re.split(r'。|!', i)
                     for i in cur_list:
@@ -59,6 +58,7 @@ def extrac(text):
                         final_list.append(i)
         '''
         final_list.append(i)
+        '''
     final_list = [x for x in final_list if x != '']
     print(final_list)
     return final_list
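The hunk above re-enables the length check (previously disabled by the opening ''') and raises the per-sentence threshold from 20 to 50 characters before re-splitting on Japanese sentence punctuation. A minimal, self-contained sketch of that splitting logic (hypothetical helper name; the real extrac also converts romaji to katakana via romajitable):

import re

def split_long_sentences(text, max_len=50):
    # Mirror extrac: drop spaces, keep short lines whole, and re-split
    # anything longer than max_len on 。 and ! as re.split(r'。|!', i) does.
    final_list = []
    for piece in text.split('\n'):
        piece = piece.replace(' ', '')
        if len(piece) <= 1:
            continue
        if len(piece) > max_len:
            final_list.extend(re.split(r'。|!', piece))
        else:
            final_list.append(piece)
    return [x for x in final_list if x != '']

print(split_long_sentences('短い文。\n' + 'とても' * 20 + '長い文!おわり。'))

Splitting on sentence-ending punctuation keeps each synthesis call short, which bounds per-sentence inference time and keeps subtitle cues aligned to natural breaks.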
@@ -121,7 +121,7 @@ def sle(language,text):
     return text
 
 def get_text(text,hps_ms):
-    text_norm = text_to_sequence(text,hps_ms.data.text_cleaners)
+    text_norm = text_to_sequence(text,hps_ms.symbols,hps_ms.data.text_cleaners)
     if hps_ms.data.add_blank:
         text_norm = commons.intersperse(text_norm, 0)
     text_norm = torch.LongTensor(text_norm)
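get_text now passes hps_ms.symbols to text_to_sequence, so the symbol table comes from the model config rather than a global. When add_blank is set, commons.intersperse pads a blank token (id 0) between symbols; a minimal re-implementation matching the behavior of the upstream VITS helper:

def intersperse(lst, item):
    # [a, b, c] -> [item, a, item, b, item, c, item]
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result

assert intersperse([5, 7, 9], 0) == [0, 5, 0, 7, 0, 9, 0]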
@@ -129,9 +129,11 @@ def get_text(text,hps_ms):
 
 def create_tts_fn(net_g,hps,speaker_id):
     speaker_id = int(speaker_id)
-    def tts_fn(history,is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+    def tts_fn(is_transfer,original_speaker, target_speaker,history,is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
         text = check_text(text)
         repeat_time = int(repeat_time)
+        original_speaker_id = selection(original_speaker)
+        target_speaker_id = selection(target_speaker)
         if is_gpt:
             openai.api_key = api_key
             text = chatgpt(text)
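tts_fn now also takes is_transfer plus source/target speaker names and resolves them to ids via selection, which is defined elsewhere in main.py. A hypothetical stand-in showing the assumed name-to-id lookup (idols here is a made-up speaker list; the real one comes from the model config):

idols = ['speakerA', 'speakerB', 'speakerC']  # hypothetical; loaded elsewhere in main.py

def selection(speaker_name):
    # Assumed behavior: map a dropdown speaker name to its integer speaker id.
    return idols.index(speaker_name)

assert selection('speakerB') == 1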
@@ -166,40 +168,56 @@ def create_tts_fn(net_g,hps,speaker_id):
         for i in b:
             text = text.replace(i,'>')
         final_list = extrac(text.replace('“','').replace('”',''))
-        audio_fin = []
-        c = 0
-        t = datetime.timedelta(seconds=0)
-        f1 = open("subtitles.srt",'w',encoding='utf-8')
-        for sentence in final_list:
-            c +=1
-            stn_tst = get_text(sle(language,sentence),hps)
-            with torch.no_grad():
-                x_tst = stn_tst.unsqueeze(0).to(dev)
-                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
-                sid = torch.LongTensor([speaker_id]).to(dev)
-                t1 = time.time()
-                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
-                t2 = time.time()
-                spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
-                print(spending_time)
-                time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
-                last_time = datetime.timedelta(seconds=len(audio)/float(22050))
-                t+=last_time
-                time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
-                print(time_end)
-                f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
-                audio_fin.append(audio)
-        try:
-            write(audiopath + '.wav',22050,np.concatenate(audio_fin))
-            if is_audio:
-                for i in range(repeat_time):
-                    cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
-                    os.system(cmd)
-
-        except:
-            pass
-
-        file_path = "subtitles.srt"
+        split_list = []
+        while len(final_list) > 0:
+            split_list.append(final_list[:500])
+            final_list = final_list[500:]
+        c0 = 0
+        for lists in split_list:
+            audio_fin = []
+            t = datetime.timedelta(seconds=0)
+            c = 0
+            f1 = open(audiopath.replace('.wav',str(c0)+".srt"),'w',encoding='utf-8')
+            for sentence in lists:
+                try:
+                    c +=1
+                    stn_tst = get_text(sle(language,sentence),hps)
+                    with torch.no_grad():
+                        x_tst = stn_tst.unsqueeze(0).to(dev)
+                        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+                        sid = torch.LongTensor([original_speaker_id]).to(dev)
+                        t1 = time.time()
+                        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+                        t2 = time.time()
+                        spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
+                        print(spending_time)
+                        time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
+                        last_time = datetime.timedelta(seconds=len(audio)/float(22050))
+                        t+=last_time
+                        time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
+                        print(time_end)
+                        f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
+                    if is_transfer:
+                        with torch.no_grad():
+                            y = torch.FloatTensor(audio)
+                            y = y / max(-y.min(), y.max()) / 0.99
+                            y = y.to(dev)
+                            y = y.unsqueeze(0)
+                            spec = spectrogram_torch(y, hps.data.filter_length,
+                                hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                                center=False).to(dev)
+                            spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
+                            sid_src = torch.LongTensor([original_speaker_id]).to(dev)
+                            sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
+                            audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
+                                0, 0].data.cpu().float().numpy()
+                            del y, spec, spec_lengths, sid_src, sid_tgt
+                    audio_fin.append(audio)
+                except:
+                    pass
+            write(audiopath.replace('.wav',str(c0)+'.wav'),22050,np.concatenate(audio_fin))
+            c0 += 1
+        file_path = audiopath.replace('.wav',str(c0)+".srt")
         return history,file_path,(hps.data.sampling_rate, np.concatenate(audio_fin))
     return tts_fn
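The rewritten loop batches final_list into chunks of 500 sentences, writes one WAV and one SRT per chunk (filenames derived from audiopath via replace('.wav', str(c0) + ...)), and optionally routes each synthesized sentence through net_g.voice_conversion when is_transfer is set. Note that file_path is computed with the already-incremented c0, so it names one index past the last .srt actually written. The SRT cue times come from a running datetime.timedelta; a standalone check of that timestamp formula (it does not zero-pad hours to SRT's strict HH:MM:SS,mmm form):

import datetime

def srt_time(t):
    # str(timedelta) renders as 'H:MM:SS[.ffffff]'; keep 'H:MM:SS' and take
    # the first three microsecond digits as milliseconds, as the loop does.
    return str(t).split(".")[0] + "," + str(t.microseconds)[:3]

t = datetime.timedelta(seconds=0)
audio_samples = 55125                                   # 2.5 s of audio at 22050 Hz
t += datetime.timedelta(seconds=audio_samples / float(22050))
print(srt_time(t))                                      # 0:00:02,500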
 
@@ -460,7 +478,6 @@ if __name__ == '__main__':
                 output1 = gr.Audio(label="采样率22050")
             with gr.Accordion(label="Setting", open=False):
                 input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
-                input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
                 input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                 input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                 input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
@@ -468,17 +485,21 @@ if __name__ == '__main__':
                 audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
                 api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
                 api_input2 = gr.TextArea(label="api-key",lines=1,value = '懂得都懂')
+            with gr.Accordion(label="Advanced Setting", open=False):
                 output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
-                audio_input1 = gr.Checkbox(value=False, label="修改音频路径(live2d)")
+                audio_input1 = gr.Checkbox(value=False, label="保存路径")
                 audio_input2 = gr.TextArea(label="音频路径",lines=1,value = 'D:/path/to/live2d/sounds/temp.wav')
                 input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
                 inputxt = gr.File(label="Text")
+                is_transfer = gr.Checkbox(value=False, label="是否声线转化")
+                source_speaker = gr.Dropdown(choices=idols, value=name, label="source speaker")
+                target_speaker = gr.Dropdown(choices=idols, value=name, label="target speaker")
                 btnbook = gr.Button("小说合成")
             btnVC.click(bot, inputs = [chatbot,input1], outputs = [chatbot]).then(
-                tts_fn, inputs=[chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
+                tts_fn, inputs=[is_transfer,source_speaker,target_speaker,chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
             )
             btnbook.click(bot, inputs = [chatbot,inputxt], outputs = [chatbot]).then(
-                tts_fn, inputs=[chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
+                tts_fn, inputs=[is_transfer,source_speaker,target_speaker,chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
             )
         with gr.Tab("Voice Conversion(类似sovits)"):
             gr.Markdown("""
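Both buttons chain two callbacks: bot updates the chatbot history first, then .then() feeds the updated state into tts_fn. A stripped-down sketch of that Gradio pattern (Gradio 3.x-era API, as used in main.py; bot and speak here are hypothetical stand-ins for the app's real callbacks):

import gradio as gr

def bot(history, message):
    # Hypothetical: append the user's turn to the chat history.
    return history + [[message, None]]

def speak(history):
    # Hypothetical stand-in for tts_fn, reading the updated history.
    return "synthesizing: " + history[-1][0]

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    btn = gr.Button("Send")
    out = gr.Textbox()
    # Update the chat state first, then chain the second callback on it.
    btn.click(bot, inputs=[chatbot, msg], outputs=[chatbot]).then(
        speak, inputs=[chatbot], outputs=[out]
    )

demo.launch()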
 