Gregniuki committed on
Commit
5863c7f
·
verified ·
1 Parent(s): 2b71199

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -6
app.py CHANGED
@@ -182,10 +182,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
182
  zh_pause_punc = r"。,、;:?!"
183
  ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
184
  gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
185
- duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
186
- print(f"Duration: {duration} seconds")
187
- duration = min(5000, max(300, int(133 * gen_text_len / (speed * 10))))
188
- print(f"Duration: {duration} seconds")
 
 
189
 
190
  # inference
191
  with torch.inference_mode():
@@ -738,9 +740,9 @@ This is a local web UI for F5 TTS with advanced batch processing support. This a
738
 
739
  The checkpoint supports Polish, English, and German.
740
 
741
- Generations using CPU takes usually 2-3 minutes
742
 
743
- If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
744
 
745
  **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
746
  """
 
182
  zh_pause_punc = r"。,、;:?!"
183
  ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
184
  gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
185
+ if len(ref_text) >= 1:
186
+ duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
187
+ print(f"Duration: {duration} seconds")
188
+ else:
189
+ duration = min(5000, max(300, int(133 * gen_text_len / (speed * 10))))
190
+ print(f"Duration: {duration} seconds")
191
 
192
  # inference
193
  with torch.inference_mode():
 
740
 
741
  The checkpoint supports Polish, English, and German.
742
 
743
+ Generation on CPU usually takes 2-3 minutes using 8-step inference.
744
 
745
+ If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 5s, and shortening your prompt.
746
 
747
  **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
748
  """