Spaces:
Running
Running
app.py updated
Browse files
app.py
CHANGED
|
@@ -68,30 +68,9 @@ LANGUAGES = {
|
|
| 68 |
"Italian": {"code": "it", "native": "Italiano", "tier": "core"},
|
| 69 |
"Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
|
| 70 |
"Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
|
| 71 |
-
"Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
|
| 72 |
-
"Turkish": {"code": "tr", "native": "Turkce", "tier": "extended"},
|
| 73 |
-
"Vietnamese": {"code": "vi", "native": "Tieng Viet", "tier": "extended"},
|
| 74 |
-
"Thai": {"code": "th", "native": "Thai", "tier": "extended"},
|
| 75 |
-
"Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
|
| 76 |
-
"Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
|
| 77 |
"Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
|
| 78 |
-
"Bengali": {"code": "bn", "native": "Bengali", "tier": "extended"},
|
| 79 |
"Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
|
| 80 |
-
"Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
|
| 81 |
-
"Czech": {"code": "cs", "native": "Cestina", "tier": "extended"},
|
| 82 |
-
"Romanian": {"code": "ro", "native": "Romana", "tier": "extended"},
|
| 83 |
-
"Greek": {"code": "el", "native": "Greek", "tier": "extended"},
|
| 84 |
-
"Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
|
| 85 |
-
"Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
|
| 86 |
-
"Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
|
| 87 |
-
"Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
|
| 88 |
-
"Ukrainian": {"code": "uk", "native": "Ukrainian", "tier": "extended"},
|
| 89 |
-
"Hebrew": {"code": "he", "native": "Hebrew", "tier": "extended"},
|
| 90 |
-
"Persian": {"code": "fa", "native": "Farsi", "tier": "extended"},
|
| 91 |
-
"Cantonese": {"code": "yue", "native": "Cantonese", "tier": "extended"},
|
| 92 |
-
"Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
|
| 93 |
"Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
|
| 94 |
-
"Tamil": {"code": "ta", "native": "Tamil", "tier": "extended"},
|
| 95 |
}
|
| 96 |
|
| 97 |
VOICE_CLONE_LANGUAGES = {
|
|
@@ -101,25 +80,19 @@ VOICE_CLONE_LANGUAGES = {
|
|
| 101 |
|
| 102 |
PRESET_VOICES = [
|
| 103 |
"Cherry -- Sunny, friendly",
|
| 104 |
-
"Serena -- Gentle, soft",
|
| 105 |
"Jennifer -- Cinematic narrator",
|
| 106 |
"Katerina -- Mature, rich rhythm",
|
| 107 |
"Ethan -- Warm, energetic",
|
| 108 |
"Ryan -- Dramatic, rhythmic",
|
| 109 |
"Kai -- Soothing, calm",
|
| 110 |
-
"Neil -- Precise, clear",
|
| 111 |
-
"Lenn -- Rational, steady",
|
| 112 |
"Aiden -- Young, lively",
|
| 113 |
"Eldric Sage -- Authoritative narrator",
|
| 114 |
"Arthur -- Classic, mature",
|
| 115 |
-
"Mia -- Young, versatile",
|
| 116 |
"Bella -- Elegant, warm",
|
| 117 |
"Vivian -- Professional, clear",
|
| 118 |
"Seren -- Calm, measured",
|
| 119 |
"Dolce -- Sweet, melodic",
|
| 120 |
-
"Bellona -- Strong, commanding",
|
| 121 |
"Vincent -- Rich, theatrical",
|
| 122 |
-
"Andre -- Deep, resonant",
|
| 123 |
]
|
| 124 |
|
| 125 |
|
|
@@ -286,17 +259,69 @@ def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK):
|
|
| 286 |
# ==============================
|
| 287 |
# VOICE CLONING
|
| 288 |
# ==============================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
raise FileNotFoundError(f"Audio file not found: {audio_path}")
|
| 293 |
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
|
| 298 |
b64_str = base64.b64encode(filepath.read_bytes()).decode()
|
| 299 |
-
data_uri = f"data:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
|
| 301 |
payload = {
|
| 302 |
"model": VOICE_CLONE_MODEL,
|
|
@@ -640,7 +665,7 @@ DESCRIPTION = """
|
|
| 640 |
# Audiobook Generator
|
| 641 |
### English Text to Multi-Language Audiobook with Voice Cloning
|
| 642 |
|
| 643 |
-
Upload English text and generate a narrated audiobook in **
|
| 644 |
Choose a **preset voice** or **clone any voice** from a short audio sample!
|
| 645 |
|
| 646 |
"""
|
|
@@ -721,7 +746,7 @@ with gr.Blocks(
|
|
| 721 |
)
|
| 722 |
|
| 723 |
clone_audio = gr.Audio(
|
| 724 |
-
label="Upload Voice Sample (10
|
| 725 |
type="filepath",
|
| 726 |
visible=False,
|
| 727 |
)
|
|
@@ -729,10 +754,10 @@ with gr.Blocks(
|
|
| 729 |
clone_info = gr.Markdown(
|
| 730 |
value=(
|
| 731 |
"> **Voice cloning tips:**\n"
|
| 732 |
-
"> - Use 10
|
|
|
|
| 733 |
"> - No background music or noise\n"
|
| 734 |
"> - WAV (16-bit), MP3, or M4A format\n"
|
| 735 |
-
"> - Sample rate at least 24 kHz recommended\n"
|
| 736 |
"> - Cloned voice TTS supports 10 core languages only"
|
| 737 |
),
|
| 738 |
visible=False,
|
|
|
|
| 68 |
"Italian": {"code": "it", "native": "Italiano", "tier": "core"},
|
| 69 |
"Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
|
| 70 |
"Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
"Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
|
|
|
|
| 72 |
"Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
"Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
|
|
|
|
| 74 |
}
|
| 75 |
|
| 76 |
VOICE_CLONE_LANGUAGES = {
|
|
|
|
| 80 |
|
| 81 |
PRESET_VOICES = [
|
| 82 |
"Cherry -- Sunny, friendly",
|
|
|
|
| 83 |
"Jennifer -- Cinematic narrator",
|
| 84 |
"Katerina -- Mature, rich rhythm",
|
| 85 |
"Ethan -- Warm, energetic",
|
| 86 |
"Ryan -- Dramatic, rhythmic",
|
| 87 |
"Kai -- Soothing, calm",
|
|
|
|
|
|
|
| 88 |
"Aiden -- Young, lively",
|
| 89 |
"Eldric Sage -- Authoritative narrator",
|
| 90 |
"Arthur -- Classic, mature",
|
|
|
|
| 91 |
"Bella -- Elegant, warm",
|
| 92 |
"Vivian -- Professional, clear",
|
| 93 |
"Seren -- Calm, measured",
|
| 94 |
"Dolce -- Sweet, melodic",
|
|
|
|
| 95 |
"Vincent -- Rich, theatrical",
|
|
|
|
| 96 |
]
|
| 97 |
|
| 98 |
|
|
|
|
| 259 |
# ==============================
|
| 260 |
# VOICE CLONING
|
| 261 |
# ==============================
|
| 262 |
+
def prepare_clone_audio(audio_path):
    """
    Prepare an audio sample for voice cloning.

    The sample's duration is read with ``ffprobe``; the sample is then
    re-encoded with ``ffmpeg`` to mono, 24 kHz, 16-bit PCM WAV.

    - Samples shorter than 10 seconds are rejected.
    - Samples of up to 60 seconds are only converted.
    - Longer samples are trimmed to a 60-second window (the limit this
      module uses for the cloning API) that starts at most 5 seconds into
      the recording, so that leading silence/noise is skipped.

    Args:
        audio_path: Path of the source audio file (string).

    Returns:
        Path to the prepared file, i.e. ``audio_path + "_prepared.wav"``.
        The caller is responsible for deleting it.

    Raises:
        ValueError: If the duration cannot be determined (missing,
            unreadable, or non-audio file) or the sample is shorter than
            10 seconds.
        subprocess.CalledProcessError: If the ffmpeg conversion fails.
    """
    min_seconds = 10
    max_seconds = 60
    intro_skip_seconds = 5

    # Ask ffprobe for the bare duration value (no key, no wrapper lines).
    probe = subprocess.run(
        ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", audio_path],
        capture_output=True, text=True,
    )
    try:
        duration = float(probe.stdout.strip())
    except ValueError:
        # ffprobe prints nothing (or "N/A") when it cannot read the file;
        # report that explicitly instead of a bare float() parsing error.
        raise ValueError(
            f"Could not determine audio duration for {audio_path!r}. "
            "The file may be missing, unreadable, or not a supported "
            "audio format."
        ) from None

    if duration < min_seconds:
        raise ValueError(
            f"Audio is too short ({duration:.1f}s). "
            f"Please provide at least 10 seconds of clear speech."
        )

    tmp_prepared = audio_path + "_prepared.wav"

    command = ["ffmpeg", "-y"]
    if duration > max_seconds:
        # Skip up to 5s of intro, but never start so late that fewer than
        # 60s of audio remain after the seek point.
        start = min(intro_skip_seconds, duration - max_seconds)
        command += ["-ss", str(start), "-t", str(max_seconds)]
    command += [
        "-i", audio_path,
        # Target format: 24 kHz, mono, 16-bit PCM WAV.
        "-ar", "24000", "-ac", "1", "-acodec", "pcm_s16le",
        tmp_prepared,
    ]
    subprocess.run(command, capture_output=True, check=True)

    return tmp_prepared
|
| 307 |
+
|
| 308 |
+
|
| 309 |
def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
|
| 310 |
+
# Prepare audio (trim if needed, convert format)
|
| 311 |
+
prepared_path = prepare_clone_audio(audio_path)
|
|
|
|
| 312 |
|
| 313 |
+
filepath = pathlib.Path(prepared_path)
|
| 314 |
+
if not filepath.exists():
|
| 315 |
+
raise FileNotFoundError(f"Prepared audio file not found: {prepared_path}")
|
| 316 |
|
| 317 |
b64_str = base64.b64encode(filepath.read_bytes()).decode()
|
| 318 |
+
data_uri = f"data:audio/wav;base64,{b64_str}"
|
| 319 |
+
|
| 320 |
+
# Clean up prepared file
|
| 321 |
+
try:
|
| 322 |
+
os.remove(prepared_path)
|
| 323 |
+
except OSError:
|
| 324 |
+
pass
|
| 325 |
|
| 326 |
payload = {
|
| 327 |
"model": VOICE_CLONE_MODEL,
|
|
|
|
| 665 |
# Audiobook Generator
|
| 666 |
### English Text to Multi-Language Audiobook with Voice Cloning
|
| 667 |
|
| 668 |
+
Upload English text and generate a narrated audiobook in **selected languages**.
|
| 669 |
Choose a **preset voice** or **clone any voice** from a short audio sample!
|
| 670 |
|
| 671 |
"""
|
|
|
|
| 746 |
)
|
| 747 |
|
| 748 |
clone_audio = gr.Audio(
|
| 749 |
+
label="Upload Voice Sample (10 seconds to 3 minutes, WAV/MP3/M4A)",
|
| 750 |
type="filepath",
|
| 751 |
visible=False,
|
| 752 |
)
|
|
|
|
| 754 |
clone_info = gr.Markdown(
|
| 755 |
value=(
|
| 756 |
"> **Voice cloning tips:**\n"
|
| 757 |
+
"> - Use 10 seconds to 3 minutes of clear, single-speaker audio\n"
|
| 758 |
+
"> - Longer samples give better voice quality (auto-trimmed to best 60s)\n"
|
| 759 |
"> - No background music or noise\n"
|
| 760 |
"> - WAV (16-bit), MP3, or M4A format\n"
|
|
|
|
| 761 |
"> - Cloned voice TTS supports 10 core languages only"
|
| 762 |
),
|
| 763 |
visible=False,
|