Spaces:

PlotweaverModel
/

AudioBook

Running

App Files Files Community

PlotweaverModel committed 10 days ago

Commit

e2d9fde

verified ·

1 Parent(s): 21d72c9

app.py updated

Browse files

Files changed (1) hide show

app.py +63 -38

app.py CHANGED Viewed

@@ -68,30 +68,9 @@ LANGUAGES = {
     "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
     "Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
     "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
-    "Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
-    "Turkish": {"code": "tr", "native": "Turkce", "tier": "extended"},
-    "Vietnamese": {"code": "vi", "native": "Tieng Viet", "tier": "extended"},
-    "Thai": {"code": "th", "native": "Thai", "tier": "extended"},
-    "Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
-    "Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
     "Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
-    "Bengali": {"code": "bn", "native": "Bengali", "tier": "extended"},
     "Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
-    "Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
-    "Czech": {"code": "cs", "native": "Cestina", "tier": "extended"},
-    "Romanian": {"code": "ro", "native": "Romana", "tier": "extended"},
-    "Greek": {"code": "el", "native": "Greek", "tier": "extended"},
-    "Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
-    "Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
-    "Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
-    "Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
-    "Ukrainian": {"code": "uk", "native": "Ukrainian", "tier": "extended"},
-    "Hebrew": {"code": "he", "native": "Hebrew", "tier": "extended"},
-    "Persian": {"code": "fa", "native": "Farsi", "tier": "extended"},
-    "Cantonese": {"code": "yue", "native": "Cantonese", "tier": "extended"},
-    "Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
     "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
-    "Tamil": {"code": "ta", "native": "Tamil", "tier": "extended"},
 }
 VOICE_CLONE_LANGUAGES = {
@@ -101,25 +80,19 @@ VOICE_CLONE_LANGUAGES = {
 PRESET_VOICES = [
     "Cherry -- Sunny, friendly",
-    "Serena -- Gentle, soft",
     "Jennifer -- Cinematic narrator",
     "Katerina -- Mature, rich rhythm",
     "Ethan -- Warm, energetic",
     "Ryan -- Dramatic, rhythmic",
     "Kai -- Soothing, calm",
-    "Neil -- Precise, clear",
-    "Lenn -- Rational, steady",
     "Aiden -- Young, lively",
     "Eldric Sage -- Authoritative narrator",
     "Arthur -- Classic, mature",
-    "Mia -- Young, versatile",
     "Bella -- Elegant, warm",
     "Vivian -- Professional, clear",
     "Seren -- Calm, measured",
     "Dolce -- Sweet, melodic",
-    "Bellona -- Strong, commanding",
     "Vincent -- Rich, theatrical",
-    "Andre -- Deep, resonant",
 ]
@@ -286,17 +259,69 @@ def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK):
 # ==============================
 # VOICE CLONING
 # ==============================
 def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
-    filepath = pathlib.Path(audio_path)
-    if not filepath.exists():
-        raise FileNotFoundError(f"Audio file not found: {audio_path}")
-    ext = filepath.suffix.lower()
-    mime_map = {".wav": "audio/wav", ".mp3": "audio/mpeg", ".m4a": "audio/mp4"}
-    mime_type = mime_map.get(ext, "audio/mpeg")
     b64_str = base64.b64encode(filepath.read_bytes()).decode()
-    data_uri = f"data:{mime_type};base64,{b64_str}"
     payload = {
         "model": VOICE_CLONE_MODEL,
@@ -640,7 +665,7 @@ DESCRIPTION = """
 # Audiobook Generator
 ### English Text to Multi-Language Audiobook with Voice Cloning
-Upload English text and generate a narrated audiobook in **36 languages**.
 Choose a **preset voice** or **clone any voice** from a short audio sample!
 """
@@ -721,7 +746,7 @@ with gr.Blocks(
             )
             clone_audio = gr.Audio(
-                label="Upload Voice Sample (10-60s of clear speech, WAV/MP3/M4A)",
                 type="filepath",
                 visible=False,
             )
@@ -729,10 +754,10 @@ with gr.Blocks(
             clone_info = gr.Markdown(
                 value=(
                     "> **Voice cloning tips:**\n"
-                    "> - Use 10-60 seconds of clear, single-speaker audio\n"
                     "> - No background music or noise\n"
                     "> - WAV (16-bit), MP3, or M4A format\n"
-                    "> - Sample rate at least 24 kHz recommended\n"
                     "> - Cloned voice TTS supports 10 core languages only"
                 ),
                 visible=False,

     "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
     "Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
     "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
     "Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
     "Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
     "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
 }
 VOICE_CLONE_LANGUAGES = {
 # Preset voice choices. Each entry is one string of the form
 # "Name -- short description of the voice's character".
 PRESET_VOICES = [
     "Cherry -- Sunny, friendly",
     "Jennifer -- Cinematic narrator",
     "Katerina -- Mature, rich rhythm",
     "Ethan -- Warm, energetic",
     "Ryan -- Dramatic, rhythmic",
     "Kai -- Soothing, calm",
     "Aiden -- Young, lively",
     "Eldric Sage -- Authoritative narrator",
     "Arthur -- Classic, mature",
     "Bella -- Elegant, warm",
     "Vivian -- Professional, clear",
     "Seren -- Calm, measured",
     "Dolce -- Sweet, melodic",
     "Vincent -- Rich, theatrical",
 ]
 # ==============================
 # VOICE CLONING
 # ==============================
def prepare_clone_audio(audio_path):
    """
    Prepare an audio sample for voice cloning.

    The input is probed with ``ffprobe`` and re-encoded with ``ffmpeg`` into a
    mono, 24 kHz, 16-bit PCM WAV file:

    - Samples shorter than 10 seconds are rejected.
    - Samples of up to 60 seconds are converted whole.
    - Longer samples are cut to a 60-second window (the cap noted as the
      cloning API's maximum) that starts 5 seconds in, or earlier when the
      file is only slightly longer than 60 seconds, so that intro
      silence/noise is skipped.

    No upper limit on the input length is enforced here.

    Args:
        audio_path: Path of the source audio file.

    Returns:
        Path of the prepared WAV file: ``audio_path`` with ``_prepared.wav``
        appended. The file is not removed by this function; the caller is
        expected to delete it when done.

    Raises:
        ValueError: If the duration cannot be determined (unreadable or
            unsupported file) or the sample is shorter than 10 seconds.
        subprocess.CalledProcessError: If ``ffmpeg`` exits with an error.
    """
    min_seconds = 10
    clip_seconds = 60
    intro_skip_seconds = 5

    # Ask ffprobe for the container duration as a bare number on stdout.
    probe = subprocess.run(
        ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", audio_path],
        capture_output=True, text=True,
    )
    raw_duration = probe.stdout.strip()
    try:
        duration = float(raw_duration)
    except ValueError as err:
        # ffprobe prints nothing (or "N/A") for missing/corrupt/unsupported
        # input; surface that clearly instead of a bare float() failure.
        raise ValueError(
            f"Could not determine audio duration for {audio_path!r} "
            f"(ffprobe exit code {probe.returncode}, output {raw_duration!r}). "
            "Please upload a valid WAV, MP3, or M4A file."
        ) from err

    if duration < min_seconds:
        raise ValueError(
            f"Audio is too short ({duration:.1f}s). "
            f"Please provide at least 10 seconds of clear speech."
        )

    tmp_prepared = f"{audio_path}_prepared.wav"

    command = ["ffmpeg", "-y"]
    if duration > clip_seconds:
        # Skip the first few seconds, but never start so late that fewer than
        # clip_seconds of audio remain.
        start = min(intro_skip_seconds, duration - clip_seconds)
        command += ["-ss", str(start), "-t", str(clip_seconds)]
    # Output format: mono, 24 kHz, 16-bit PCM WAV.
    command += [
        "-i", audio_path,
        "-ar", "24000", "-ac", "1", "-acodec", "pcm_s16le",
        tmp_prepared,
    ]
    subprocess.run(command, capture_output=True, check=True)
    return tmp_prepared
 def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
+    # Prepare audio (trim if needed, convert format)
+    prepared_path = prepare_clone_audio(audio_path)
+    filepath = pathlib.Path(prepared_path)
+    if not filepath.exists():
+        raise FileNotFoundError(f"Prepared audio file not found: {prepared_path}")
     b64_str = base64.b64encode(filepath.read_bytes()).decode()
+    data_uri = f"data:audio/wav;base64,{b64_str}"
+    # Clean up prepared file
+    try:
+        os.remove(prepared_path)
+    except OSError:
+        pass
     payload = {
         "model": VOICE_CLONE_MODEL,
 # Audiobook Generator
 ### English Text to Multi-Language Audiobook with Voice Cloning
+Upload English text and generate a narrated audiobook in **selected languages**.
 Choose a **preset voice** or **clone any voice** from a short audio sample!
 """
             )
             clone_audio = gr.Audio(
+                label="Upload Voice Sample (10 seconds to 3 minutes, WAV/MP3/M4A)",
                 type="filepath",
                 visible=False,
             )
             clone_info = gr.Markdown(
                 value=(
                     "> **Voice cloning tips:**\n"
+                    "> - Use 10 seconds to 3 minutes of clear, single-speaker audio\n"
+                    "> - Longer samples give better voice quality (auto-trimmed to best 60s)\n"
                     "> - No background music or noise\n"
                     "> - WAV (16-bit), MP3, or M4A format\n"
                     "> - Cloned voice TTS supports 10 core languages only"
                 ),
                 visible=False,