litagin committed on
Commit
76e50e0
•
1 Parent(s): 6c6be1a

Improve message

Browse files
Files changed (1) hide show
  1. app.py +36 -29
app.py CHANGED
@@ -145,7 +145,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
145
  print(f"text_language: {text_language}")
146
 
147
  if len(prompt_text) > 100 or len(text) > 100:
148
- raise ValueError("Input text is limited to 100 characters.")
 
149
  t0 = ttime()
150
  prompt_text = prompt_text.strip("\n")
151
  prompt_language, text = prompt_language, text.strip("\n")
@@ -153,7 +154,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
153
  wav16k, _ = librosa.load(ref_wav_path, sr=16000) # 派蒙
154
  # length of wav16k in sec should be in 60s
155
  if len(wav16k) > 16000 * 60:
156
- raise ValueError("Input audio is limited to 60 seconds.")
 
157
  wav16k = wav16k[: int(hps.data.sampling_rate * max_sec)]
158
  wav16k = torch.from_numpy(wav16k)
159
  if is_half == True:
@@ -233,9 +235,12 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
233
  audio_opt.append(zero_wav)
234
  t4 = ttime()
235
  print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
236
- yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(
237
- np.int16
238
- )
 
 
 
239
 
240
 
241
  initial_md = """
@@ -258,30 +263,32 @@ If you do not agree with these terms and conditions, you may not use or referenc
258
 
259
  with gr.Blocks(title="GPT-SoVITS Zero-shot TTS Demo") as app:
260
  gr.Markdown(initial_md)
261
- with gr.Group():
262
- gr.Markdown(value="*Upload reference audio")
263
- with gr.Row():
264
- inp_ref = gr.Audio(label="Reference audio", type="filepath")
265
- prompt_text = gr.Textbox(label="Transcription of reference audio")
266
- prompt_language = gr.Dropdown(
267
- label="Language of reference audio",
268
- choices=["Chinese", "English", "Japanese"],
269
- value="Japanese",
270
- )
271
- gr.Markdown(value="*Text to synthesize")
272
- with gr.Row():
273
- text = gr.Textbox(label="Text to synthesize")
274
- text_language = gr.Dropdown(
275
- label="Language of text",
276
- choices=["Chinese", "English", "Japanese"],
277
- value="Japanese",
278
- )
279
- inference_button = gr.Button("Synthesize", variant="primary")
280
- output = gr.Audio(label="Result")
281
- inference_button.click(
282
- get_tts_wav,
283
- [inp_ref, prompt_text, prompt_language, text, text_language],
284
- [output],
285
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
 
287
  app.launch(inbrowser=True)
 
145
  print(f"text_language: {text_language}")
146
 
147
  if len(prompt_text) > 100 or len(text) > 100:
148
+ print("Input text is limited to 100 characters.")
149
+ return "Input text is limited to 100 characters.", None
150
  t0 = ttime()
151
  prompt_text = prompt_text.strip("\n")
152
  prompt_language, text = prompt_language, text.strip("\n")
 
154
  wav16k, _ = librosa.load(ref_wav_path, sr=16000) # 派蒙
155
  # length of wav16k in sec should be in 60s
156
  if len(wav16k) > 16000 * 60:
157
+ print("Input audio is limited to 60 seconds.")
158
+ return "Input audio is limited to 60 seconds.", None
159
  wav16k = wav16k[: int(hps.data.sampling_rate * max_sec)]
160
  wav16k = torch.from_numpy(wav16k)
161
  if is_half == True:
 
235
  audio_opt.append(zero_wav)
236
  t4 = ttime()
237
  print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
238
+ return "Success! time: %.3f\t%.3f\t%.3f\t%.3f" % (
239
+ t1 - t0,
240
+ t2 - t1,
241
+ t3 - t2,
242
+ t4 - t3,
243
+ ), (hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16))
244
 
245
 
246
  initial_md = """
 
263
 
264
  with gr.Blocks(title="GPT-SoVITS Zero-shot TTS Demo") as app:
265
  gr.Markdown(initial_md)
266
+ gr.Markdown("## Upload reference audio")
267
+ with gr.Row():
268
+ inp_ref = gr.Audio(label="Reference audio", type="filepath")
269
+ prompt_text = gr.Textbox(label="Transcription of reference audio")
270
+ prompt_language = gr.Dropdown(
271
+ label="Language of reference audio",
272
+ choices=["Chinese", "English", "Japanese"],
273
+ value="Japanese",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  )
275
+ gr.Markdown("## Text to synthesize")
276
+ with gr.Row():
277
+ text = gr.Textbox(label="Text to synthesize")
278
+ text_language = gr.Dropdown(
279
+ label="Language of text",
280
+ choices=["Chinese", "English", "Japanese"],
281
+ value="Japanese",
282
+ )
283
+ inference_button = gr.Button("Synthesize", variant="primary")
284
+ with gr.Column():
285
+ info = gr.Textbox(label="Info")
286
+ output = gr.Audio(label="Result")
287
+ inference_button.click(
288
+ get_tts_wav,
289
+ [inp_ref, prompt_text, prompt_language, text, text_language],
290
+ [info, output],
291
+ )
292
 
293
+ app.queue(max_size=10)
294
  app.launch(inbrowser=True)