- app.py +637 -0
- requirements.txt +2 -0
app.py
ADDED
@@ -0,0 +1,637 @@
"""
📖 Audiobook Generator — English Source to Multi-Language Audio
Powered by Qwen3.5-Omni-Plus via DashScope API

Two modes:
1. Translation + TTS: Translate English text to target language, then generate speech
2. Direct TTS: Generate speech from English text directly

Deploy as a Hugging Face Space:
1. Create a new Space (SDK: Gradio)
2. Upload app.py and requirements.txt
3. Add DASHSCOPE_API_KEY as a Space Secret
"""

import os
import base64
import math
import shutil
import struct
import subprocess
import tempfile
import time
import re

import gradio as gr
from openai import OpenAI

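# Note: besides the Python packages in requirements.txt, this app shells out to
# the ffmpeg binary (WAV concatenation, silence generation, and the MP3 export
# below), so ffmpeg must be available on the host; on a Gradio Space that
# usually means listing it in a packages.txt file.
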
# ──────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────
MODEL = "qwen3.5-omni-plus"
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"

# Maximum characters per chunk sent to the API
# The model has token limits, so we split long texts
MAX_CHARS_PER_CHUNK = 1500

# All 36 speech output languages supported by Qwen3.5-Omni
# Core 10 languages have the best quality; extended languages are supported
# but may vary in quality as they include dialects
LANGUAGES = {
    # ── Core 10 Languages (highest quality) ──
    "English": {"code": "en", "native": "English", "tier": "core"},
    "Chinese (Mandarin)": {"code": "zh", "native": "中文", "tier": "core"},
    "Japanese": {"code": "ja", "native": "日本語", "tier": "core"},
    "Korean": {"code": "ko", "native": "한국어", "tier": "core"},
    "German": {"code": "de", "native": "Deutsch", "tier": "core"},
    "French": {"code": "fr", "native": "Français", "tier": "core"},
    "Russian": {"code": "ru", "native": "Русский", "tier": "core"},
    "Portuguese": {"code": "pt", "native": "Português", "tier": "core"},
    "Spanish": {"code": "es", "native": "Español", "tier": "core"},
    "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
    # ── Extended Languages (Qwen3.5-Omni expanded to 36) ──
    "Arabic": {"code": "ar", "native": "العربية", "tier": "extended"},
    "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
    "Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
    "Turkish": {"code": "tr", "native": "Türkçe", "tier": "extended"},
    "Vietnamese": {"code": "vi", "native": "Tiếng Việt", "tier": "extended"},
    "Thai": {"code": "th", "native": "ภาษาไทย", "tier": "extended"},
    "Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
    "Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
    "Hindi": {"code": "hi", "native": "हिन्दी", "tier": "extended"},
    "Bengali": {"code": "bn", "native": "বাংলা", "tier": "extended"},
    "Urdu": {"code": "ur", "native": "اردو", "tier": "extended"},
    "Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
    "Czech": {"code": "cs", "native": "Čeština", "tier": "extended"},
    "Romanian": {"code": "ro", "native": "Română", "tier": "extended"},
    "Greek": {"code": "el", "native": "Ελληνικά", "tier": "extended"},
    "Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
    "Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
    "Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
    "Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
    "Ukrainian": {"code": "uk", "native": "Українська", "tier": "extended"},
    "Hebrew": {"code": "he", "native": "עברית", "tier": "extended"},
    "Persian": {"code": "fa", "native": "فارسی", "tier": "extended"},
    "Cantonese": {"code": "yue", "native": "粵語", "tier": "extended"},
    "Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
    "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
    "Tamil": {"code": "ta", "native": "தமிழ்", "tier": "extended"},
}

VOICES = {
    "Male Voices": [
        "Ethan — Warm, energetic",
        "Ryan — Dramatic, rhythmic",
        "Kai — Soothing, calm",
        "Neil — Precise, clear",
        "Lenn — Rational, steady",
        "Aiden — Young, lively",
        "Eldric Sage — Authoritative narrator",
        "Arthur — Classic, mature",
        "Elias — Soft, thoughtful",
        "Alek — Confident, modern",
        "Andre — Deep, resonant",
        "Emilien — Gentle, French-inspired",
        "Vincent — Rich, theatrical",
    ],
    "Female Voices": [
        "Cherry — Sunny, friendly",
        "Serena — Gentle, soft",
        "Jennifer — Cinematic narrator",
        "Katerina — Mature, rich rhythm",
        "Chelsie — Bright, expressive",
        "Mia — Young, versatile",
        "Bella — Elegant, warm",
        "Vivian — Professional, clear",
        "Moon — Dreamy, ethereal",
        "Maia — Confident, articulate",
        "Seren — Calm, measured",
        "Dolce — Sweet, melodic",
        "Bellona — Strong, commanding",
        "Bunny — Playful, light",
        "Momo — Cute, upbeat",
        "Mochi — Soft, adorable",
    ],
}

# Flatten voice list for the dropdown
ALL_VOICES = []
for category, voices in VOICES.items():
    for v in voices:
        ALL_VOICES.append(v)


def get_voice_name(voice_label: str) -> str:
    """Extract just the voice name from 'Name — Description' format."""
    return voice_label.split("—")[0].strip()

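# For example, get_voice_name("Jennifer — Cinematic narrator") returns "Jennifer";
# only that short name is passed to the API's audio={"voice": ...} parameter.
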
# ──────────────────────────────────────────────
# Audio helpers
# ──────────────────────────────────────────────
def base64_to_wav(b64_data: str, output_path: str):
    """Decode base64 PCM data and write a proper WAV file."""
    audio_bytes = base64.b64decode(b64_data)
    sample_rate = 24000
    num_channels = 1
    bits_per_sample = 16
    byte_rate = sample_rate * num_channels * bits_per_sample // 8
    block_align = num_channels * bits_per_sample // 8
    data_size = len(audio_bytes)
    with open(output_path, "wb") as f:
        f.write(b"RIFF")
        f.write(struct.pack("<I", 36 + data_size))
        f.write(b"WAVE")
        f.write(b"fmt ")
        f.write(struct.pack("<I", 16))
        f.write(struct.pack("<H", 1))
        f.write(struct.pack("<H", num_channels))
        f.write(struct.pack("<I", sample_rate))
        f.write(struct.pack("<I", byte_rate))
        f.write(struct.pack("<H", block_align))
        f.write(struct.pack("<H", bits_per_sample))
        f.write(b"data")
        f.write(struct.pack("<I", data_size))
        f.write(audio_bytes)


def concatenate_wavs(wav_files: list, output_path: str):
    """Concatenate multiple WAV files using ffmpeg."""
    if not wav_files:
        return
    if len(wav_files) == 1:
        shutil.copy2(wav_files[0], output_path)
        return
    list_file = output_path + ".txt"
    with open(list_file, "w") as f:
        for wav in wav_files:
            f.write(f"file '{wav}'\n")
    subprocess.run(
        ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
         "-i", list_file, "-c", "copy", output_path],
        capture_output=True, check=True,
    )
    os.remove(list_file)

# ──────────────────────────────────────────────
# Text splitting
# ──────────────────────────────────────────────
def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> list:
    """
    Split text into chunks at sentence boundaries.
    Tries to keep paragraphs together when possible.
    """
    # Normalize whitespace
    text = text.strip()
    if not text:
        return []

    # If short enough, return as-is
    if len(text) <= max_chars:
        return [text]

    chunks = []
    # First split by paragraphs
    paragraphs = re.split(r"\n\s*\n", text)

    current_chunk = ""
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        # If adding this paragraph keeps us under the limit
        if len(current_chunk) + len(para) + 2 <= max_chars:
            current_chunk = (current_chunk + "\n\n" + para).strip()
        else:
            # Save current chunk if it has content
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""

            # If the paragraph itself is too long, split by sentences
            if len(para) > max_chars:
                sentences = re.split(r"(?<=[.!?])\s+", para)
                for sentence in sentences:
                    if len(current_chunk) + len(sentence) + 1 <= max_chars:
                        current_chunk = (current_chunk + " " + sentence).strip()
                    else:
                        if current_chunk:
                            chunks.append(current_chunk)
                        # If a single sentence is too long, force-split it
                        if len(sentence) > max_chars:
                            words = sentence.split()
                            current_chunk = ""
                            for word in words:
                                if len(current_chunk) + len(word) + 1 <= max_chars:
                                    current_chunk = (current_chunk + " " + word).strip()
                                else:
                                    if current_chunk:
                                        chunks.append(current_chunk)
                                    current_chunk = word
                        else:
                            current_chunk = sentence
            else:
                current_chunk = para

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

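# Rough example of the behaviour above: with the default max_chars of 1500, a
# 4,000-character chapter comes back as about three chunks, split first at
# blank-line paragraph breaks and then, for any over-long paragraph, at
# sentence boundaries (. ! ?).
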
# ──────────────────────────────────────────────
# API: Generate speech for a text chunk
# ──────────────────────────────────────────────
def generate_speech_chunk(
    client: OpenAI,
    text: str,
    voice: str,
    language: str,
    lang_config: dict,
    translate: bool,
    chunk_index: int,
    output_dir: str,
) -> tuple:
    """
    Send a text chunk to Qwen3.5-Omni-Plus and get back audio.
    If translate=True, translates from English to target language and speaks.
    If translate=False, speaks the text directly in English.
    Returns (wav_path, transcript) or (None, error_msg).
    """
    output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")

    if translate and language != "English":
        system_prompt = (
            f"You are a professional audiobook narrator and translator.\n"
            f"You will receive English text. Your task:\n"
            f"1. Translate the text into natural, fluent {language} ({lang_config['native']}).\n"
            f"2. Read the translated text aloud with clear, expressive narration.\n"
            f"3. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
            f"   descriptions, and emotional moments.\n"
            f"4. Respond ONLY with the spoken {language} narration — no English,\n"
            f"   no meta-commentary, no chapter headers unless they're in the text.\n"
            f"5. Maintain a natural reading pace suitable for an audiobook.\n"
            f"6. Translate idioms and cultural references appropriately."
        )
        user_text = (
            f"Translate the following English text into {language} and narrate it "
            f"as an audiobook. Respond only with the spoken {language} narration:\n\n{text}"
        )
    else:
        system_prompt = (
            "You are a professional audiobook narrator.\n"
            "You will receive text to read aloud. Your task:\n"
            "1. Read the text with clear, expressive narration.\n"
            "2. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
            "   descriptions, and emotional moments.\n"
            "3. Respond ONLY with the spoken narration — no meta-commentary.\n"
            "4. Maintain a natural reading pace suitable for an audiobook.\n"
            "5. Pause appropriately between paragraphs and at punctuation."
        )
        user_text = f"Narrate the following text as an audiobook:\n\n{text}"

    try:
        completion = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_text},
            ],
            modalities=["text", "audio"],
            audio={"voice": voice, "format": "wav"},
            stream=True,
            stream_options={"include_usage": True},
        )

        audio_chunks = []
        transcript_parts = []

        for event in completion:
            if not event.choices:
                continue
            delta = event.choices[0].delta
            if hasattr(delta, "content") and delta.content:
                transcript_parts.append(delta.content)
            if hasattr(delta, "audio") and delta.audio:
                if isinstance(delta.audio, dict):
                    if "data" in delta.audio:
                        audio_chunks.append(delta.audio["data"])
                elif hasattr(delta.audio, "data") and delta.audio.data:
                    audio_chunks.append(delta.audio.data)

        transcript = "".join(transcript_parts)

        if audio_chunks:
            full_audio_b64 = "".join(audio_chunks)
            base64_to_wav(full_audio_b64, output_wav)
            return output_wav, transcript
        else:
            return None, "No audio received from API"

    except Exception as e:
        return None, str(e)

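# The streamed audio deltas collected above are base64-encoded PCM fragments;
# they are joined in order and wrapped in a RIFF/WAV header by base64_to_wav(),
# which assumes the 24 kHz / mono / 16-bit format that helper hardcodes.
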
# ──────────────────────────────────────────────
# Generate silence between chapters/sections
# ──────────────────────────────────────────────
def generate_silence(duration_sec: float, output_path: str):
    """Generate a silent WAV file."""
    subprocess.run(
        ["ffmpeg", "-y", "-f", "lavfi",
         "-i", "anullsrc=r=24000:cl=mono",
         "-t", str(duration_sec), "-acodec", "pcm_s16le", output_path],
        capture_output=True, check=True,
    )

# ──────────────────────────────────────────────
# Main pipeline
# ──────────────────────────────────────────────
def generate_audiobook(
    text_input: str,
    file_input,
    target_language: str,
    voice_label: str,
    add_pauses: bool,
    progress=gr.Progress(),
):
    """Main audiobook generation pipeline."""

    # ── Resolve text source ──
    if file_input is not None:
        try:
            with open(file_input, "r", encoding="utf-8", errors="replace") as f:
                text = f.read()
        except Exception as e:
            raise gr.Error(f"Failed to read file: {e}")
    elif text_input and text_input.strip():
        text = text_input.strip()
    else:
        raise gr.Error("Please provide text or upload a file.")

    if len(text) < 10:
        raise gr.Error("Text is too short. Please provide more content.")

    # ── API key ──
    api_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not api_key:
        raise gr.Error(
            "DASHSCOPE_API_KEY not set. Add it as a Space Secret "
            "(Settings → Secrets → New Secret)."
        )

    voice = get_voice_name(voice_label)
    lang_config = LANGUAGES[target_language]
    translate = target_language != "English"
    client = OpenAI(api_key=api_key, base_url=BASE_URL)
    tmp_dir = tempfile.mkdtemp(prefix="audiobook_")

    try:
        # ── Split text ──
        progress(0.05, desc="Splitting text into chunks...")
        chunks = split_text_into_chunks(text)
        total_chunks = len(chunks)
        total_chars = sum(len(c) for c in chunks)

        progress(0.08, desc=f"Processing {total_chunks} chunks ({total_chars:,} characters)...")

        # ── Generate speech for each chunk ──
        audio_files = []
        all_transcripts = []
        silence_path = os.path.join(tmp_dir, "silence.wav")
        if add_pauses:
            generate_silence(1.5, silence_path)

        for i, chunk in enumerate(chunks):
            frac = 0.1 + 0.8 * (i / total_chunks)
            progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")

            wav_path, transcript = generate_speech_chunk(
                client, chunk, voice, target_language,
                lang_config, translate, i, tmp_dir,
            )

            if wav_path:
                audio_files.append(wav_path)
                if transcript:
                    all_transcripts.append(transcript)
                # Add pause between chunks
                if add_pauses and i < total_chunks - 1:
                    audio_files.append(silence_path)
            else:
                # transcript holds the error message in the failure case
                all_transcripts.append(f"⚠️ Chunk {i+1} failed: {transcript}")
                # Insert silence placeholder for failed chunk
                fail_silence = os.path.join(tmp_dir, f"fail_silence_{i:04d}.wav")
                generate_silence(2.0, fail_silence)
                audio_files.append(fail_silence)

        if not audio_files:
            raise gr.Error("No audio was generated. Check your API key and try again.")

        # ── Concatenate all audio ──
        progress(0.92, desc="Assembling audiobook...")
        final_audio = os.path.join(tmp_dir, "audiobook.wav")
        concatenate_wavs(audio_files, final_audio)

        # ── Convert to MP3 for smaller file size ──
        progress(0.96, desc="Converting to MP3...")
        final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
        subprocess.run(
            ["ffmpeg", "-y", "-i", final_audio,
             "-codec:a", "libmp3lame", "-b:a", "128k",
             "-ar", "24000", "-ac", "1", final_mp3],
            capture_output=True, check=True,
        )

        progress(1.0, desc="Done!")

        # Build transcript display
        transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""

        # Stats
        audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
        stats = (
            f"**Audiobook Generated!**\n\n"
            f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
            f"- **Language:** {target_language} ({lang_config['native']})\n"
            f"- **Voice:** {voice_label}\n"
            f"- **File size:** {audio_size:.1f} MB\n"
            f"- **Quality tier:** {lang_config['tier'].title()}\n"
        )
        if lang_config["tier"] == "extended":
            stats += "\n> ⚠️ This is an extended language. Voice quality may vary compared to the core 10 languages."

        return final_mp3, stats, transcript_text

    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"Pipeline error: {str(e)}")
    finally:
        # Don't clean up tmp_dir yet — Gradio needs the files
        pass

# ──────────────────────────────────────────────
# Build language choices with tier labels
# ──────────────────────────────────────────────
def get_language_choices():
    core = [f"⭐ {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "core"]
    extended = [f" {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "extended"]
    return core + extended


def clean_language_name(choice: str) -> str:
    """Remove the tier prefix from the dropdown choice."""
    # Strip only the star prefix and surrounding whitespace, so multi-word
    # language names such as "Chinese (Mandarin)" still match LANGUAGES keys.
    return choice.replace("⭐ ", "").strip()


def generate_wrapper(text_input, file_input, language_choice, voice, add_pauses, progress=gr.Progress()):
    language = clean_language_name(language_choice)
    return generate_audiobook(text_input, file_input, language, voice, add_pauses, progress)

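# Example: clean_language_name("⭐ Chinese (Mandarin)") returns "Chinese (Mandarin)",
# which matches the LANGUAGES keys; extended dropdown entries such as "Swahili"
# pass through unchanged.
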
# ──────────────────────────────────────────────
# Sample text
# ──────────────────────────────────────────────
SAMPLE_TEXT = """Chapter 1: The Beginning

The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin tossed by the gods.

"One day," she whispered to the seagulls that perched on the railing, "I'll follow that sun to wherever it goes."

The gulls, as always, said nothing. They merely tilted their heads and regarded her with ancient, knowing eyes before launching themselves into the wind.

Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather — grey in storms, green in sunlight, and something altogether different in the strange purple twilight that sometimes settled over the coast in autumn.

The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside and the old building trembled like a living thing.

"Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.

And he would smile — that slow, careful smile that seemed to cost him something each time — and begin."""

# ──────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────
DESCRIPTION = """
# 📖 Audiobook Generator
### English Text → Multi-Language Audiobook
**Powered by Qwen3.5-Omni-Plus**

Paste or upload English text and get a professionally narrated audiobook in any of **36 languages**.
The AI translates and narrates with expressive, audiobook-quality speech.

⭐ = Core language (best quality) · Others = Extended support
"""

# Language dropdown choices
lang_choices = []
lang_choices.append("── Core Languages (Best Quality) ──")
for name, cfg in LANGUAGES.items():
    if cfg["tier"] == "core":
        lang_choices.append(f"⭐ {name}")
lang_choices.append("── Extended Languages ──")
for name, cfg in LANGUAGES.items():
    if cfg["tier"] == "extended":
        lang_choices.append(name)

with gr.Blocks(
    title="Audiobook Generator — Qwen3.5-Omni",
    theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="purple",
        neutral_hue="slate",
    ),
) as demo:

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # ── Left column: Input ──
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="English Text",
                placeholder="Paste your English text here...",
                lines=12,
                max_lines=30,
            )

            file_input = gr.File(
                label="Or Upload a Text File (.txt, .md)",
                file_types=[".txt", ".md", ".text"],
                type="filepath",
            )

            sample_btn = gr.Button("📄 Load Sample Text", variant="secondary", size="sm")

            with gr.Row():
                target_lang = gr.Dropdown(
                    choices=[c for c in lang_choices if not c.startswith("──")],
                    value="⭐ English",
                    label="Target Language",
                    info="⭐ = Core (best quality). Choose English for no translation.",
                )

                voice_select = gr.Dropdown(
                    choices=ALL_VOICES,
                    value="Jennifer — Cinematic narrator",
                    label="Narrator Voice",
                )

            add_pauses = gr.Checkbox(
                value=True,
                label="Add pauses between sections",
                info="Adds 1.5s silence between text chunks for natural pacing",
            )

            generate_btn = gr.Button(
                "🎙️ Generate Audiobook",
                variant="primary",
                size="lg",
            )

        # ── Right column: Output ──
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audiobook",
                type="filepath",
            )

            stats_output = gr.Markdown(label="Generation Stats")

            with gr.Accordion("Translation / Narration Transcript", open=False):
                transcript_output = gr.Markdown()

    # ── Event handlers ──
    sample_btn.click(
        fn=lambda: SAMPLE_TEXT,
        outputs=text_input,
    )

    generate_btn.click(
        fn=generate_wrapper,
        inputs=[text_input, file_input, target_lang, voice_select, add_pauses],
        outputs=[audio_output, stats_output, transcript_output],
    )

    # ── Footer ──
    gr.Markdown(
        "---\n"
        "**How it works:** Your text is split into chunks, each sent to Qwen3.5-Omni-Plus "
        "for translation (if needed) + speech synthesis, then assembled into a single MP3 audiobook.\n\n"
        "**Supported languages (36):** Arabic, Bengali, Cantonese, Chinese, Czech, Danish, Dutch, "
        "English, Filipino, Finnish, French, German, Greek, Hebrew, Hindi, Hungarian, Indonesian, "
        "Italian, Japanese, Korean, Malay, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, "
        "Spanish, Swahili, Swedish, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese\n\n"
        "Built with [Gradio](https://gradio.app) · Model by [Alibaba Qwen](https://qwen.ai) · "
        "API via [DashScope](https://www.alibabacloud.com/help/en/model-studio/)"
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt
ADDED
@@ -0,0 +1,2 @@
openai>=1.52.0
gradio>=5.0.0