Commit c25a639
1 Parent(s): d289ff1
Update app.py
app.py CHANGED
```diff
@@ -69,6 +69,9 @@ def extrac(text):
     text = re.sub("<[^>]*>","",text)
     result_list = re.split(r'\n', text)
     final_list = []
+    if not torch.cuda.is_available():
+        if len(final_list) > 10:
+            return ['对不起,做不到']
     for i in result_list:
         if is_english(i):
             i = romajitable.to_kana(i).katakana
```
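A note on the guard this hunk introduces: as committed, `len(final_list) > 10` is evaluated while `final_list` is still empty, so the CPU-only refusal can never fire. A minimal sketch of the presumably intended behavior, measuring the split result instead (`extrac_guard` and `max_lines` are hypothetical names, not part of the repo):

```python
import re

def extrac_guard(text, max_lines=10):
    # Hypothetical variant of the new guard: check the split result itself,
    # not the accumulator list that is still empty at this point in extrac().
    # The commit additionally gates this on `not torch.cuda.is_available()`.
    text = re.sub("<[^>]*>", "", text)      # strip HTML tags, as in app.py
    result_list = re.split(r"\n", text)
    if len(result_list) > max_lines:
        return ["对不起,做不到"]             # same refusal message as the commit
    return result_list
```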
```diff
@@ -153,15 +156,21 @@ def get_text(text,hps_ms):
     text_norm = torch.LongTensor(text_norm)
     return text_norm
 
-def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
+def vc_fn(text,language,n_scale,n_scale_w,l_scale,original_speaker, target_speaker, record_audio, upload_audio):
     input_audio = record_audio if record_audio is not None else upload_audio
-    if input_audio is None:
-        return "You need to record or upload an audio", None
-    sampling_rate, audio = input_audio
     original_speaker_id = selection(original_speaker)
     target_speaker_id = selection(target_speaker)
-
-
+    if input_audio is None:
+        stn_tst = get_text(sle(language,text),hps)
+        with torch.no_grad():
+            x_tst = stn_tst.unsqueeze(0).to(dev)
+            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+            sid = torch.LongTensor([original_speaker_id]).to(dev)
+            audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+        sampling_rate = hps.data.sampling_rate
+    else:
+        sampling_rate, audio = input_audio
+        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
     if len(audio.shape) > 1:
         audio = librosa.to_mono(audio.transpose(1, 0))
     if sampling_rate != hps.data.sampling_rate:
```
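The reworked `vc_fn` gains a TTS fallback: when nothing is recorded or uploaded, it synthesizes `text` with the source speaker and converts that, instead of returning an error. For uploaded audio, the context lines show the usual Gradio preprocessing; below is a self-contained sketch of those steps under stated assumptions (the final `librosa.resample` call is an assumption, since the hunk cuts off at the sampling-rate check):

```python
import numpy as np
import librosa

def prepare_audio(audio, sampling_rate, target_sr):
    # Integer PCM from gr.Audio -> float32 in [-1, 1], as in the else-branch.
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    # gr.Audio yields (samples, channels); librosa.to_mono wants (channels, samples).
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    # Assumed continuation of the truncated context: match the model's rate.
    if sampling_rate != target_sr:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=target_sr)
    return audio
```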
```diff
@@ -334,16 +343,26 @@ def selection(speaker):
     else:
         return 0
 
+def check_text(input):
+    if isinstance(input, str):
+        return input
+    else:
+        with open(input.name, "r", encoding="utf-8") as f:
+            return f.read()
+
 def create_tts_fn(net_g,hps,speaker_id):
     speaker_id = int(speaker_id)
     def tts_fn(is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+        text = check_text(text)
         repeat_ime = int(repeat_time)
         if is_gpt:
             openai.api_key = api_key
             text,messages = chatgpt(text)
             htm = to_html(messages)
         else:
-
+            messages = []
+            messages.append({"role": "assistant", "content": text})
+            htm = to_html(messages)
         if not extract:
             t1 = time.time()
             stn_tst = get_text(sle(language,text),hps)
```
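`check_text` is what lets one `tts_fn` serve both the TextArea and the `gr.File` upload added further down: Gradio passes uploads as temp-file wrappers whose `.name` holds the path. A runnable sketch of that dual behavior (`FakeUpload` is a stand-in for the Gradio file object, not part of the repo):

```python
import tempfile

class FakeUpload:
    # Stand-in for the object gr.File hands to a callback: the uploaded
    # temp file's path is exposed as .name.
    def __init__(self, path):
        self.name = path

def check_text(input):
    # As added in this commit.
    if isinstance(input, str):
        return input
    else:
        with open(input.name, "r", encoding="utf-8") as f:
            return f.read()

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write("一段待合成的小说文本")
print(check_text("直接输入的文本"))     # returns the string unchanged
print(check_text(FakeUpload(f.name)))  # returns the file's contents
```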
```diff
@@ -417,7 +436,7 @@ if __name__ == '__main__':
     hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
     dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     models = []
-    schools = ["Nijigasaki","ShojoKageki","
+    schools = ["ShojoKageki-Nijigasaki","ShojoKageki","Nijigasaki"]
     lan = ["中文","日文","自动","手动"]
     with open("checkpoints/info.json", "r", encoding="utf-8") as f:
         models_info = json.load(f)
```
```diff
@@ -444,12 +463,13 @@ if __name__ == '__main__':
             name = speakers[j]["name"]
             content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
         models.append(content)
-    idols = ["c1","c2","高咲侑","歩夢","かすみ","しずく","果林","愛","彼方","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","華恋","まひる","なな","クロディーヌ","ひかり",'純那',"香子","真矢","双葉","ミチル","メイファン","やちよ","晶","いちえ","ゆゆ子","塁","珠緒","あるる","ララフィン","美空","静羽","あるる"]
     with gr.Blocks() as app:
         with gr.Tabs():
             for i in schools:
                 with gr.TabItem(i):
+                    idols = ["派蒙"]
                     for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
+                        idols.append(name)
                         with gr.TabItem(name):
                             with gr.Column():
                                 with gr.Row():
```
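The hardcoded speaker list is dropped in favor of one rebuilt inside each school tab from the models actually loaded, seeded with a "派蒙" placeholder. A distilled sketch of that loop (`build_speaker_choices` is an illustrative name, not in the repo):

```python
def build_speaker_choices(model_entries, placeholder="派蒙"):
    # Seed with the placeholder, then append every speaker name registered
    # for the current school tab, mirroring the two added lines above.
    idols = [placeholder]
    for sid, name, *_rest in model_entries:  # entries are (sid, name, title, example, tts_fn)
        idols.append(name)
    return idols

print(build_speaker_choices([(0, "高咲侑"), (1, "華恋")]))
# ['派蒙', '高咲侑', '華恋']
```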
```diff
@@ -471,35 +491,45 @@ if __name__ == '__main__':
                                 input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                 input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                 input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
-                                with gr.Accordion(label="Advanced Setting(GPT3.5
+                                with gr.Accordion(label="Advanced Setting(GPT3.5接口+小说合成,建议克隆本仓库后运行main.py)", open=False):
                                     input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
+                                    inputxt = gr.File(label="Text")
+                                    btnbook = gr.Button("小说合成")
                                     output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
                                     api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
                                     api_input2 = gr.TextArea(label="api-key",lines=1,value = '见 https://openai.com/blog/openai-api')
                                     audio_input1 = gr.Checkbox(value=False, label="修改音频路径(live2d)")
                                     audio_input2 = gr.TextArea(label="音频路径",lines=1,value = '#参考 D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
                                     audio_input3 = gr.Dropdown(label="重复生成次数", choices=list(range(101)), value='0', interactive=True)
+                                    btnbook.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
                                 btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with gr.Tab("Voice Conversion(就是sovits的原理)"):
+                gr.Markdown("""
+                声线转化,使用模型中的说话人作为音源时效果更佳
+                """)
+                with gr.Column():
+                    with gr.Accordion(label="方法1:录制或上传声音,可进行歌声合成", open=False):
+                        record_audio = gr.Audio(label="record your voice", source="microphone")
+                        upload_audio = gr.Audio(label="or upload audio here", source="upload")
+                    with gr.Accordion(label="方法2:由原说话人先进行tts后套娃,适用于合成中文等特殊场景", open=True):
+                        text = gr.TextArea(label="Text", value='由源说话人进行语音转化',lines = 1)
+                        language = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
+                        n_scale = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
+                        n_scale_w = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
+                        l_scale = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1.1)
+                        source_speaker = gr.Dropdown(choices=idols, value=idols[-2], label="source speaker")
+                        target_speaker = gr.Dropdown(choices=idols, value=idols[-3], label="target speaker")
+                with gr.Column():
+                    message_box = gr.Textbox(label="Message")
+                    converted_audio = gr.Audio(label='converted audio')
+                btn = gr.Button("Convert!")
+                btn.click(vc_fn, inputs=[text,language,n_scale,n_scale_w,l_scale,source_speaker, target_speaker, record_audio, upload_audio],
+                          outputs=[message_box, converted_audio])
             with gr.Tab("说明"):
                 gr.Markdown(
                     "### <center> 请不要生成会对个人以及企划造成侵害的内容,自觉遵守相关法律,静止商业使用或让他人产生困扰\n"
                     "<div align='center'>从左到右分别是虹团,少歌中文特化版,以及五校混合版。这三个均为不同的模型,效果也有差异</div>\n"
                     "<div align='center'>因为我会时不时地更新模型,所以会碰到平台抽风问题,大部分情况下一天就能恢复了。</div>\n"
-                    '<div align="center"><a>参数说明:这个十分玄学,我还没找到最合适的,如果效果不佳可以将噪声比例和噪声偏差调节至0
+                    '<div align="center"><a>参数说明:这个十分玄学,我还没找到最合适的,如果效果不佳可以将噪声比例和噪声偏差调节至0,这回完全随机化音频源。按照经验,合成日语时也可以将噪声比例调节至0.2-0.3区间,语调会正常一些。duration代表整体语速,可视情况调至1.1或1.2</div>'
                     '<div align="center"><a>建议只在平台上体验最基础的功能,强烈建议将该仓库克隆至本地或者于colab运行 main.py或app.py</div>')
     app.launch()
```
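Net effect of this last hunk: the advanced accordion gains a file upload plus a dedicated "小说合成" button, and both buttons drive the same `tts_fn`, differing only in whether `input1` (TextArea) or `inputxt` (File) supplies the text; a new Voice Conversion tab binds the reworked `vc_fn`, with `idols[-2]`/`idols[-3]` as default source and target speakers. A minimal, self-contained sketch of that shared-callback wiring, assuming gradio 3.x (the callback body is a stub, not the repo's synthesis code):

```python
import gradio as gr

def check_text(input):
    # Dual-input helper, as added by this commit.
    if isinstance(input, str):
        return input
    with open(input.name, "r", encoding="utf-8") as f:
        return f.read()

def tts_fn(text):
    # Stub standing in for the real synthesis callback.
    return "would synthesize: " + check_text(text)[:50]

with gr.Blocks() as demo:
    input1 = gr.TextArea(label="Text", lines=1)
    inputxt = gr.File(label="Text")
    output1 = gr.Textbox(label="Message")
    btnVC = gr.Button("合成")
    btnbook = gr.Button("小说合成")
    # One callback, two trigger paths, as in the commit.
    btnVC.click(tts_fn, inputs=[input1], outputs=[output1])
    btnbook.click(tts_fn, inputs=[inputxt], outputs=[output1])

# demo.launch()  # uncomment to run locally
```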