Aki004 committed
Commit 7652637
Parent: a7d5ab2

Update app.py

Files changed (1):
  app.py +40 -7
app.py CHANGED
@@ -25,8 +25,31 @@ def tts_get_voices_list():
 
     return voices
 
-def infer(txt, voice):
-    tts = asyncio.run(edge_tts.Communicate(txt, voice).save('audio.mp3'))
+def infer(txt, tts_voice, input_audio, predict_f0, audio_mode):
+    if audio_mode:
+        if input_audio is None:
+            return 'Please upload your audio file'
+
+        sampling_rate, audio = input_audio
+        duration = audio.shape[0] / sampling_rate
+
+        if duration > 30:
+            return 'The audio file is too long, please upload an audio file shorter than 30 seconds'
+
+        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        if len(audio.shape) > 1:
+            audio = librosa.to_mono(audio.transpose(1, 0))
+        if sampling_rate != 16000:
+            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+
+        raw_path = io.BytesIO()
+        soundfile.write(raw_path, audio, 16000, format="wav")
+        raw_path.seek(0)
+        model = Svc(fr"Herta-Svc/G_10000.pth", f"Herta-Svc/config.json", device='cpu')
+        out_audio, out_sr = model.infer('speaker0', 0, raw_path, auto_predict_f0=predict_f0)
+        return (44100, out_audio.cpu().numpy())
+
+    tts = asyncio.run(edge_tts.Communicate(txt, tts_voice).save('audio.mp3'))
     audio, sr = librosa.load('audio.mp3', sr=16000, mono=True)
     raw_path = io.BytesIO()
     soundfile.write(raw_path, audio, 16000, format="wav")
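The new upload branch above converts the `(sampling_rate, numpy_array)` tuple that a `gr.Audio` input yields into the 16 kHz mono float32 WAV the Svc model reads. A minimal standalone sketch of that preprocessing follows; the helper name `prepare_raw_wav` and the synthetic test signal are illustrative and not part of this commit:

    import io

    import librosa
    import numpy as np
    import soundfile


    def prepare_raw_wav(sampling_rate: int, audio: np.ndarray) -> io.BytesIO:
        # Integer PCM (e.g. int16 from gr.Audio) -> float32 in [-1, 1]
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        # Stereo (n_samples, n_channels) -> mono
        if audio.ndim > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        # Resample to the 16 kHz rate the model input expects
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        # Write a WAV into memory and rewind so it can be read like a file
        raw_path = io.BytesIO()
        soundfile.write(raw_path, audio, 16000, format="wav")
        raw_path.seek(0)
        return raw_path


    # Example: one second of a 440 Hz tone at 44.1 kHz int16, as gr.Audio would deliver it
    sr = 44100
    t = np.linspace(0, 1, sr, endpoint=False)
    pcm = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    wav_buffer = prepare_raw_wav(sr, pcm)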
@@ -35,6 +58,12 @@ def infer(txt, voice):
     out_audio, out_sr = model.infer('speaker0', 0, raw_path, auto_predict_f0 = True,)
     return (44100, out_audio.cpu().numpy())
 
+def change_to_audio_mode(audio_mode):
+    if audio_mode:
+        return gr.Audio.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False), gr.Checkbox.update(value=True)
+    else:
+        return gr.Audio.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True), gr.Checkbox.update(value=False)
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--device', type=str, default='cpu')
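The new `change_to_audio_mode` helper uses the Gradio 3.x `gr.<Component>.update(...)` pattern: it returns one update object per component whose visibility or value should flip, so the `change` event that calls it needs a matching list of output components (four in this commit, including the F0 checkbox). Below is a minimal self-contained sketch of the same pattern with hypothetical component names, assuming Gradio 3.x (Gradio 4 removed the per-class `update` helpers in favour of `gr.update(...)` or returning new component instances):

    import gradio as gr


    def toggle_upload(use_upload: bool):
        # One update object per output component, in the same order as `outputs` below
        return (
            gr.Audio.update(visible=use_upload),        # show the uploader only in upload mode
            gr.Textbox.update(visible=not use_upload),  # hide the TTS text box in upload mode
            gr.Checkbox.update(value=use_upload),       # e.g. turn on auto F0 prediction for uploads
        )


    with gr.Blocks() as demo:
        use_upload = gr.Checkbox(label="Upload audio instead", value=False)
        upload = gr.Audio(label="Input audio", visible=False)
        text = gr.Textbox(label="TTS text")
        auto_f0 = gr.Checkbox(label="Auto predict F0", value=False)
        # The outputs list must line up with the tuple returned by toggle_upload
        use_upload.change(toggle_upload, inputs=use_upload, outputs=[upload, text, auto_f0])

    # demo.launch()  # uncomment to run the sketch standalone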
@@ -50,11 +79,15 @@ if __name__ == '__main__':
         cover = gr.Markdown('<div align="center">'
                             f'<img style="width:auto;height:300px;" src="file/Herta-Svc/herta.png">'
                             '</div>')
-        tts_text = gr.Textbox(label="TTS text (100 words limitation)", visible = True)
-        tts_voice = gr.Dropdown(choices= tts_get_voices_list(), visible = True)
+        tts_text = gr.Textbox(label="TTS text (100 words limitation)")
+        audio_input = gr.Audio(label='Please upload an audio file shorter than 30 seconds', visible=False)
+        tts_voice = gr.Dropdown(choices= tts_get_voices_list())
+        predict_f0 = gr.Checkbox(label='Auto predict F0', value=False)
+        audio_mode = gr.Checkbox(label='Upload audio instead', value=False)
         audio_output = gr.Audio(label="Output Audio")
         btn_submit = gr.Button("Generate")
 
-
-        btn_submit.click(infer, [tts_text, tts_voice], [audio_output])
-    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
+        btn_submit.click(infer, [tts_text, tts_voice, audio_input, predict_f0, audio_mode], [audio_output])
+        audio_mode.change(change_to_audio_mode, audio_mode, [audio_input, tts_text, tts_voice, predict_f0])
+
+    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
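The `tts_voice` dropdown is populated from `tts_get_voices_list()`, whose body is outside this diff. With edge-tts such a helper is typically built on `edge_tts.list_voices()`; the sketch below shows one way it might look, plus the same `Communicate(...).save(...)` call chain the TTS branch of `infer` uses. The function names and the chosen voice are illustrative, not taken from the commit:

    import asyncio

    import edge_tts


    def tts_get_voices_list_sketch():
        # edge_tts.list_voices() is a coroutine returning one dict per Microsoft Edge TTS voice
        voices = asyncio.run(edge_tts.list_voices())
        # ShortName values such as "en-US-AriaNeural" are what edge_tts.Communicate expects
        return [voice["ShortName"] for voice in voices]


    async def synthesize(text: str, voice: str, out_path: str = "audio.mp3"):
        # Synthesize text with the selected voice and save an mp3, as the TTS path of infer() does
        await edge_tts.Communicate(text, voice).save(out_path)


    if __name__ == "__main__":
        choices = tts_get_voices_list_sketch()
        print(choices[:5])
        asyncio.run(synthesize("Hello from the Herta demo.", "en-US-AriaNeural"))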