PlotweaverModel committed on
Commit
e2d9fde
·
verified ·
1 Parent(s): 21d72c9

app.py updated

Browse files
Files changed (1) hide show
  1. app.py +63 -38
app.py CHANGED
@@ -68,30 +68,9 @@ LANGUAGES = {
68
  "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
69
  "Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
70
  "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
71
- "Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
72
- "Turkish": {"code": "tr", "native": "Turkce", "tier": "extended"},
73
- "Vietnamese": {"code": "vi", "native": "Tieng Viet", "tier": "extended"},
74
- "Thai": {"code": "th", "native": "Thai", "tier": "extended"},
75
- "Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
76
- "Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
77
  "Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
78
- "Bengali": {"code": "bn", "native": "Bengali", "tier": "extended"},
79
  "Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
80
- "Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
81
- "Czech": {"code": "cs", "native": "Cestina", "tier": "extended"},
82
- "Romanian": {"code": "ro", "native": "Romana", "tier": "extended"},
83
- "Greek": {"code": "el", "native": "Greek", "tier": "extended"},
84
- "Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
85
- "Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
86
- "Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
87
- "Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
88
- "Ukrainian": {"code": "uk", "native": "Ukrainian", "tier": "extended"},
89
- "Hebrew": {"code": "he", "native": "Hebrew", "tier": "extended"},
90
- "Persian": {"code": "fa", "native": "Farsi", "tier": "extended"},
91
- "Cantonese": {"code": "yue", "native": "Cantonese", "tier": "extended"},
92
- "Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
93
  "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
94
- "Tamil": {"code": "ta", "native": "Tamil", "tier": "extended"},
95
  }
96
 
97
  VOICE_CLONE_LANGUAGES = {
@@ -101,25 +80,19 @@ VOICE_CLONE_LANGUAGES = {
101
 
102
  PRESET_VOICES = [
103
  "Cherry -- Sunny, friendly",
104
- "Serena -- Gentle, soft",
105
  "Jennifer -- Cinematic narrator",
106
  "Katerina -- Mature, rich rhythm",
107
  "Ethan -- Warm, energetic",
108
  "Ryan -- Dramatic, rhythmic",
109
  "Kai -- Soothing, calm",
110
- "Neil -- Precise, clear",
111
- "Lenn -- Rational, steady",
112
  "Aiden -- Young, lively",
113
  "Eldric Sage -- Authoritative narrator",
114
  "Arthur -- Classic, mature",
115
- "Mia -- Young, versatile",
116
  "Bella -- Elegant, warm",
117
  "Vivian -- Professional, clear",
118
  "Seren -- Calm, measured",
119
  "Dolce -- Sweet, melodic",
120
- "Bellona -- Strong, commanding",
121
  "Vincent -- Rich, theatrical",
122
- "Andre -- Deep, resonant",
123
  ]
124
 
125
 
@@ -286,17 +259,69 @@ def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK):
286
  # ==============================
287
  # VOICE CLONING
288
  # ==============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
290
- filepath = pathlib.Path(audio_path)
291
- if not filepath.exists():
292
- raise FileNotFoundError(f"Audio file not found: {audio_path}")
293
 
294
- ext = filepath.suffix.lower()
295
- mime_map = {".wav": "audio/wav", ".mp3": "audio/mpeg", ".m4a": "audio/mp4"}
296
- mime_type = mime_map.get(ext, "audio/mpeg")
297
 
298
  b64_str = base64.b64encode(filepath.read_bytes()).decode()
299
- data_uri = f"data:{mime_type};base64,{b64_str}"
 
 
 
 
 
 
300
 
301
  payload = {
302
  "model": VOICE_CLONE_MODEL,
@@ -640,7 +665,7 @@ DESCRIPTION = """
640
  # Audiobook Generator
641
  ### English Text to Multi-Language Audiobook with Voice Cloning
642
 
643
- Upload English text and generate a narrated audiobook in **36 languages**.
644
  Choose a **preset voice** or **clone any voice** from a short audio sample!
645
 
646
  """
@@ -721,7 +746,7 @@ with gr.Blocks(
721
  )
722
 
723
  clone_audio = gr.Audio(
724
- label="Upload Voice Sample (10-60s of clear speech, WAV/MP3/M4A)",
725
  type="filepath",
726
  visible=False,
727
  )
@@ -729,10 +754,10 @@ with gr.Blocks(
729
  clone_info = gr.Markdown(
730
  value=(
731
  "> **Voice cloning tips:**\n"
732
- "> - Use 10-60 seconds of clear, single-speaker audio\n"
 
733
  "> - No background music or noise\n"
734
  "> - WAV (16-bit), MP3, or M4A format\n"
735
- "> - Sample rate at least 24 kHz recommended\n"
736
  "> - Cloned voice TTS supports 10 core languages only"
737
  ),
738
  visible=False,
 
68
  "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
69
  "Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
70
  "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
 
 
 
 
 
 
71
  "Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
 
72
  "Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
 
74
  }
75
 
76
  VOICE_CLONE_LANGUAGES = {
 
80
 
81
  PRESET_VOICES = [
82
  "Cherry -- Sunny, friendly",
 
83
  "Jennifer -- Cinematic narrator",
84
  "Katerina -- Mature, rich rhythm",
85
  "Ethan -- Warm, energetic",
86
  "Ryan -- Dramatic, rhythmic",
87
  "Kai -- Soothing, calm",
 
 
88
  "Aiden -- Young, lively",
89
  "Eldric Sage -- Authoritative narrator",
90
  "Arthur -- Classic, mature",
 
91
  "Bella -- Elegant, warm",
92
  "Vivian -- Professional, clear",
93
  "Seren -- Calm, measured",
94
  "Dolce -- Sweet, melodic",
 
95
  "Vincent -- Rich, theatrical",
 
96
  ]
97
 
98
 
 
259
  # ==============================
260
  # VOICE CLONING
261
  # ==============================
262
def prepare_clone_audio(audio_path):
    """Normalise a voice sample into the WAV format used for voice cloning.

    The sample's duration is read with ``ffprobe``; the audio is then
    re-encoded with ``ffmpeg`` to mono, 24 kHz, 16-bit PCM WAV.  Samples
    longer than 60 seconds are cut to a 60-second window that starts up to
    5 seconds into the recording (to skip intro silence/noise).  No upper
    bound on the input length is enforced here.

    Args:
        audio_path: Path of the uploaded sample (``str`` or path-like).

    Returns:
        Path of the prepared file, ``<audio_path>_prepared.wav``.  This
        function does not delete it on success.

    Raises:
        FileNotFoundError: ``audio_path`` is not an existing file.
        ValueError: The duration cannot be determined, or the sample is
            shorter than 10 seconds.
        subprocess.CalledProcessError: ``ffmpeg`` failed to convert the file.
    """
    import math

    # Accept pathlib.Path as well as str; string concatenation below needs str.
    src = os.fspath(audio_path)
    if not os.path.isfile(src):
        raise FileNotFoundError(f"Audio file not found: {src}")

    # Get duration
    probe = subprocess.run(
        ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", src],
        capture_output=True, text=True,
    )
    try:
        duration = float(probe.stdout.strip())
    except ValueError:
        # Empty output (ffprobe failed) or a non-numeric value such as "N/A".
        duration = math.nan
    if not math.isfinite(duration):
        raise ValueError(
            f"Could not determine the duration of {src!r}. "
            f"Please upload a valid audio file."
        )

    if duration < 10:
        raise ValueError(
            f"Audio is too short ({duration:.1f}s). "
            f"Please provide at least 10 seconds of clear speech."
        )

    tmp_prepared = src + "_prepared.wav"

    command = ["ffmpeg", "-y"]
    if duration > 60:
        # Take 60s starting up to 5s into the audio (skip intro silence/noise),
        # never starting so late that fewer than 60s remain.
        start = min(5, duration - 60)
        command += ["-ss", str(start), "-t", "60"]
    # Convert to the target format (mono, 24kHz, 16-bit WAV).
    command += [
        "-i", src,
        "-ar", "24000", "-ac", "1", "-acodec", "pcm_s16le",
        tmp_prepared,
    ]

    try:
        subprocess.run(command, capture_output=True, check=True)
    except subprocess.CalledProcessError:
        # Do not leave a partial/corrupt output next to the input.
        try:
            os.remove(tmp_prepared)
        except OSError:
            pass
        raise

    return tmp_prepared
307
+
308
+
309
  def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
310
+ # Prepare audio (trim if needed, convert format)
311
+ prepared_path = prepare_clone_audio(audio_path)
 
312
 
313
+ filepath = pathlib.Path(prepared_path)
314
+ if not filepath.exists():
315
+ raise FileNotFoundError(f"Prepared audio file not found: {prepared_path}")
316
 
317
  b64_str = base64.b64encode(filepath.read_bytes()).decode()
318
+ data_uri = f"data:audio/wav;base64,{b64_str}"
319
+
320
+ # Clean up prepared file
321
+ try:
322
+ os.remove(prepared_path)
323
+ except OSError:
324
+ pass
325
 
326
  payload = {
327
  "model": VOICE_CLONE_MODEL,
 
665
  # Audiobook Generator
666
  ### English Text to Multi-Language Audiobook with Voice Cloning
667
 
668
+ Upload English text and generate a narrated audiobook in **selected languages**.
669
  Choose a **preset voice** or **clone any voice** from a short audio sample!
670
 
671
  """
 
746
  )
747
 
748
  clone_audio = gr.Audio(
749
+ label="Upload Voice Sample (10 seconds to 3 minutes, WAV/MP3/M4A)",
750
  type="filepath",
751
  visible=False,
752
  )
 
754
  clone_info = gr.Markdown(
755
  value=(
756
  "> **Voice cloning tips:**\n"
757
+ "> - Use 10 seconds to 3 minutes of clear, single-speaker audio\n"
758
+ "> - Longer samples give better voice quality (auto-trimmed to best 60s)\n"
759
  "> - No background music or noise\n"
760
  "> - WAV (16-bit), MP3, or M4A format\n"
 
761
  "> - Cloned voice TTS supports 10 core languages only"
762
  ),
763
  visible=False,