PlotweaverModel commited on
Commit
cda6c2f
·
verified ·
1 Parent(s): f2f44f7

Files upload

Browse files
Files changed (2) hide show
  1. app.py +637 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 📖 Audiobook Generator — English Source to Multi-Language Audio
3
+ Powered by Qwen3.5-Omni-Plus via DashScope API
4
+
5
+ Two modes:
6
+ 1. Translation + TTS: Translate English text to target language, then generate speech
7
+ 2. Direct TTS: Generate speech from English text directly
8
+
9
+ Deploy as a Hugging Face Space:
10
+ 1. Create a new Space (SDK: Gradio)
11
+ 2. Upload app.py and requirements.txt
12
+ 3. Add DASHSCOPE_API_KEY as a Space Secret
13
+ """
14
+
15
import base64
import math
import os
import re
import shutil
import struct
import subprocess
import tempfile
import time
import wave

import gradio as gr
from openai import OpenAI
27
+
28
+ # ──────────────────────────────────────────────
29
+ # Configuration
30
+ # ──────────────────────────────────────────────
31
# Model served through DashScope's OpenAI-compatible endpoint.
MODEL = "qwen3.5-omni-plus"
# International-region base URL; the OpenAI client is pointed here.
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"

# Maximum characters per chunk sent to the API
# The model has token limits, so we split long texts
# (see split_text_into_chunks for the splitting strategy).
MAX_CHARS_PER_CHUNK = 1500
37
+
38
# All 36 speech output languages supported by Qwen3.5-Omni
# Core 10 languages have the best quality; extended languages are supported
# but may vary in quality as they include dialects
# Each entry maps the display name (also the dropdown key) to its language
# code, native-script name, and quality tier; the tier drives the ⭐ labels
# in the UI and the quality warning appended to the generation stats.
LANGUAGES = {
    # ── Core 10 Languages (highest quality) ──
    "English": {"code": "en", "native": "English", "tier": "core"},
    "Chinese (Mandarin)": {"code": "zh", "native": "中文", "tier": "core"},
    "Japanese": {"code": "ja", "native": "日本語", "tier": "core"},
    "Korean": {"code": "ko", "native": "한국어", "tier": "core"},
    "German": {"code": "de", "native": "Deutsch", "tier": "core"},
    "French": {"code": "fr", "native": "Français", "tier": "core"},
    "Russian": {"code": "ru", "native": "Русский", "tier": "core"},
    "Portuguese": {"code": "pt", "native": "Português", "tier": "core"},
    "Spanish": {"code": "es", "native": "Español", "tier": "core"},
    "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
    # ── Extended Languages (Qwen3.5-Omni expanded to 36) ──
    "Arabic": {"code": "ar", "native": "العربية", "tier": "extended"},
    "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
    "Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
    "Turkish": {"code": "tr", "native": "Türkçe", "tier": "extended"},
    "Vietnamese": {"code": "vi", "native": "Tiếng Việt", "tier": "extended"},
    "Thai": {"code": "th", "native": "ภาษาไทย", "tier": "extended"},
    "Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
    "Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
    "Hindi": {"code": "hi", "native": "हिन्दी", "tier": "extended"},
    "Bengali": {"code": "bn", "native": "বাংলা", "tier": "extended"},
    "Urdu": {"code": "ur", "native": "اردو", "tier": "extended"},
    "Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
    "Czech": {"code": "cs", "native": "Čeština", "tier": "extended"},
    "Romanian": {"code": "ro", "native": "Română", "tier": "extended"},
    "Greek": {"code": "el", "native": "Ελληνικά", "tier": "extended"},
    "Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
    "Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
    "Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
    "Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
    "Ukrainian": {"code": "uk", "native": "Українська", "tier": "extended"},
    "Hebrew": {"code": "he", "native": "עברית", "tier": "extended"},
    "Persian": {"code": "fa", "native": "فارسی", "tier": "extended"},
    "Cantonese": {"code": "yue", "native": "粵語", "tier": "extended"},
    "Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
    "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
    "Tamil": {"code": "ta", "native": "தமிழ்", "tier": "extended"},
}
81
+
82
# Narrator voices grouped for display. Each label is "Name — Description";
# the bare name (everything before the em dash) is what the API accepts,
# extracted later by get_voice_name().
VOICES = {
    "Male Voices": [
        "Ethan — Warm, energetic",
        "Ryan — Dramatic, rhythmic",
        "Kai — Soothing, calm",
        "Neil — Precise, clear",
        "Lenn — Rational, steady",
        "Aiden — Young, lively",
        "Eldric Sage — Authoritative narrator",
        "Arthur — Classic, mature",
        "Elias — Soft, thoughtful",
        "Alek — Confident, modern",
        "Andre — Deep, resonant",
        "Emilien — Gentle, French-inspired",
        "Vincent — Rich, theatrical",
    ],
    "Female Voices": [
        "Cherry — Sunny, friendly",
        "Serena — Gentle, soft",
        "Jennifer — Cinematic narrator",
        "Katerina — Mature, rich rhythm",
        "Chelsie — Bright, expressive",
        "Mia — Young, versatile",
        "Bella — Elegant, warm",
        "Vivian — Professional, clear",
        "Moon — Dreamy, ethereal",
        "Maia — Confident, articulate",
        "Seren — Calm, measured",
        "Dolce — Sweet, melodic",
        "Bellona — Strong, commanding",
        "Bunny — Playful, light",
        "Momo — Cute, upbeat",
        "Mochi — Soft, adorable",
    ],
}

# Flatten the categorized voice labels into one list for the dropdown.
# dict preserves insertion order, so male voices appear before female.
# (Replaces a manual nested append loop whose `category` key was unused.)
ALL_VOICES = [voice for group in VOICES.values() for voice in group]
123
+
124
+
125
def get_voice_name(voice_label: str) -> str:
    """Return the bare voice name from a 'Name — Description' label.

    Labels without an em dash are returned unchanged (minus surrounding
    whitespace).
    """
    name, _sep, _description = voice_label.partition("—")
    return name.strip()
128
+
129
+
130
+ # ──────────────────────────────────────────────
131
+ # Audio helpers
132
+ # ──────────────────────────────────────────────
133
def base64_to_wav(b64_data: str, output_path: str):
    """Decode base64-encoded raw PCM and write it out as a WAV file.

    The streamed audio is accumulated as base64 text by the caller
    (generate_speech_chunk); the decoded bytes are treated as 24 kHz,
    mono, 16-bit little-endian PCM with no RIFF header.

    Args:
        b64_data: Base64 string of raw PCM sample data.
        output_path: Destination path for the WAV file.
    """
    audio_bytes = base64.b64decode(b64_data)
    # Use the stdlib wave module to emit a correct RIFF/WAVE header
    # instead of hand-assembling it with struct.pack.
    with wave.open(output_path, "wb") as wav_out:
        wav_out.setnchannels(1)      # mono
        wav_out.setsampwidth(2)      # 16-bit samples
        wav_out.setframerate(24000)  # 24 kHz
        wav_out.writeframes(audio_bytes)
157
+
158
+
159
def concatenate_wavs(wav_files: list, output_path: str):
    """Concatenate WAV files into one using ffmpeg's concat demuxer.

    Args:
        wav_files: Ordered list of WAV paths. All files are expected to share
            the same format (true for files produced by this app), so ffmpeg
            can stream-copy without re-encoding.
        output_path: Destination WAV path.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
    """
    if not wav_files:
        return
    if len(wav_files) == 1:
        # Nothing to join — just copy the single file.
        shutil.copy2(wav_files[0], output_path)
        return
    list_file = output_path + ".txt"
    try:
        with open(list_file, "w") as f:
            for wav in wav_files:
                # Concat-demuxer entries are single-quoted; paths from
                # tempfile.mkdtemp never contain quote characters.
                f.write(f"file '{wav}'\n")
        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
             "-i", list_file, "-c", "copy", output_path],
            capture_output=True, check=True,
        )
    finally:
        # Fix: previously the list file leaked when ffmpeg failed, because
        # check=True raised before os.remove() was reached.
        if os.path.exists(list_file):
            os.remove(list_file)
176
+
177
+
178
+ # ──────────────────────────────────────────────
179
+ # Text splitting
180
+ # ──────────────────────────────────────────────
181
def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> list:
    """
    Split text into chunks at sentence boundaries.
    Tries to keep paragraphs together when possible.

    The splitting cascades through three levels:
      1. blank-line-separated paragraphs are packed greedily into chunks;
      2. a paragraph longer than max_chars is split at sentence ends (.!?);
      3. a sentence longer than max_chars is split at word boundaries.

    Note: a single word longer than max_chars is kept intact, so a chunk
    can exceed max_chars in that pathological case.

    Args:
        text: Input text, possibly containing blank-line paragraph breaks.
        max_chars: Soft upper bound on the length of each returned chunk.

    Returns:
        List of non-empty chunks in original order; [] for blank input.
    """
    # Normalize whitespace
    text = text.strip()
    if not text:
        return []

    # If short enough, return as-is
    if len(text) <= max_chars:
        return [text]

    chunks = []
    # First split by paragraphs
    paragraphs = re.split(r"\n\s*\n", text)

    # current_chunk accumulates text across loop iterations and is flushed
    # to `chunks` whenever the next piece would overflow max_chars.
    current_chunk = ""
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        # If adding this paragraph keeps us under the limit
        # (+2 accounts for the "\n\n" separator inserted below)
        if len(current_chunk) + len(para) + 2 <= max_chars:
            current_chunk = (current_chunk + "\n\n" + para).strip()
        else:
            # Save current chunk if it has content
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""

            # If the paragraph itself is too long, split by sentences
            if len(para) > max_chars:
                # Lookbehind split keeps the terminal punctuation attached
                # to each sentence.
                sentences = re.split(r"(?<=[.!?])\s+", para)
                for sentence in sentences:
                    if len(current_chunk) + len(sentence) + 1 <= max_chars:
                        current_chunk = (current_chunk + " " + sentence).strip()
                    else:
                        if current_chunk:
                            chunks.append(current_chunk)
                        # If a single sentence is too long, force-split it
                        if len(sentence) > max_chars:
                            words = sentence.split()
                            current_chunk = ""
                            for word in words:
                                if len(current_chunk) + len(word) + 1 <= max_chars:
                                    current_chunk = (current_chunk + " " + word).strip()
                                else:
                                    if current_chunk:
                                        chunks.append(current_chunk)
                                    current_chunk = word
                        else:
                            current_chunk = sentence
            else:
                # Paragraph fits on its own; it starts the next chunk.
                current_chunk = para

    # Flush whatever is left in the accumulator.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks
243
+
244
+
245
+ # ──────────────────────────────────────────────
246
+ # API: Generate speech for a text chunk
247
+ # ──────────────────────────────────────────────
248
def generate_speech_chunk(
    client: OpenAI,
    text: str,
    voice: str,
    language: str,
    lang_config: dict,
    translate: bool,
    chunk_index: int,
    output_dir: str,
) -> tuple:
    """
    Send a text chunk to Qwen3.5-Omni-Plus and get back audio.

    If translate=True (and language is not English), the model is prompted
    to translate the English text and narrate the translation; otherwise it
    narrates the text as-is.

    Args:
        client: OpenAI-compatible client pointed at the DashScope endpoint.
        text: The chunk of English source text.
        voice: Bare voice name (e.g. "Jennifer") accepted by the API.
        language: Display name of the target language (a LANGUAGES key).
        lang_config: LANGUAGES[language] entry; only 'native' is read here.
        translate: Whether to translate before narrating.
        chunk_index: Zero-based index, used to name the output file.
        output_dir: Directory where the chunk WAV is written.

    Returns:
        (wav_path, transcript) on success, or (None, error_msg) on failure.
        Errors are returned as values rather than raised so the caller can
        continue with the remaining chunks.
    """
    output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")

    if translate and language != "English":
        system_prompt = (
            f"You are a professional audiobook narrator and translator.\n"
            f"You will receive English text. Your task:\n"
            f"1. Translate the text into natural, fluent {language} ({lang_config['native']}).\n"
            f"2. Read the translated text aloud with clear, expressive narration.\n"
            f"3. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
            f"   descriptions, and emotional moments.\n"
            f"4. Respond ONLY with the spoken {language} narration — no English,\n"
            f"   no meta-commentary, no chapter headers unless they're in the text.\n"
            f"5. Maintain a natural reading pace suitable for an audiobook.\n"
            f"6. Translate idioms and cultural references appropriately."
        )
        user_text = (
            f"Translate the following English text into {language} and narrate it "
            f"as an audiobook. Respond only with the spoken {language} narration:\n\n{text}"
        )
    else:
        system_prompt = (
            "You are a professional audiobook narrator.\n"
            "You will receive text to read aloud. Your task:\n"
            "1. Read the text with clear, expressive narration.\n"
            "2. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
            "   descriptions, and emotional moments.\n"
            "3. Respond ONLY with the spoken narration — no meta-commentary.\n"
            "4. Maintain a natural reading pace suitable for an audiobook.\n"
            "5. Pause appropriately between paragraphs and at punctuation."
        )
        user_text = f"Narrate the following text as an audiobook:\n\n{text}"

    try:
        completion = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_text},
            ],
            # Request both a text transcript and synthesized audio.
            # NOTE(review): format is requested as "wav" but base64_to_wav
            # treats the payload as raw PCM — confirm against DashScope docs.
            modalities=["text", "audio"],
            audio={"voice": voice, "format": "wav"},
            stream=True,
            stream_options={"include_usage": True},
        )

        audio_chunks = []
        transcript_parts = []

        # Accumulate streamed deltas: text content feeds the transcript,
        # audio arrives as fragments collected in audio_chunks.
        for event in completion:
            if not event.choices:
                continue
            delta = event.choices[0].delta
            if hasattr(delta, "content") and delta.content:
                transcript_parts.append(delta.content)
            if hasattr(delta, "audio") and delta.audio:
                # Defensive: the audio payload may be a plain dict or an
                # object exposing a .data attribute, depending on the SDK.
                if isinstance(delta.audio, dict):
                    if "data" in delta.audio:
                        audio_chunks.append(delta.audio["data"])
                elif hasattr(delta.audio, "data") and delta.audio.data:
                    audio_chunks.append(delta.audio.data)

        transcript = "".join(transcript_parts)

        if audio_chunks:
            # Fragments are base64 text; joined they form the full stream,
            # which base64_to_wav decodes and wraps in a WAV container.
            full_audio_b64 = "".join(audio_chunks)
            base64_to_wav(full_audio_b64, output_wav)
            return output_wav, transcript
        else:
            return None, "No audio received from API"

    except Exception as e:
        # Deliberate best-effort: report the error text to the caller
        # instead of aborting the whole audiobook run.
        return None, str(e)
336
+
337
+
338
+ # ──────────────────────────────────────────────
339
+ # Generate silence between chapters/sections
340
+ # ──────────────────────────────────────────────
341
def generate_silence(duration_sec: float, output_path: str):
    """Generate a silent mono 24 kHz WAV file of the given duration.

    Matches the sample rate/channel layout of the speech chunks so the
    result can be concatenated with them via stream copy.

    Args:
        duration_sec: Length of the silence in seconds.
        output_path: Destination WAV path.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
    """
    subprocess.run(
        ["ffmpeg", "-y", "-f", "lavfi",
         # anullsrc emits digital silence; fix: plain string here — the
         # previous f-string had no placeholders (lint F541).
         "-i", "anullsrc=r=24000:cl=mono",
         "-t", str(duration_sec), "-acodec", "pcm_s16le", output_path],
        capture_output=True, check=True,
    )
349
+
350
+
351
+ # ──────────────────────────────────────────────
352
+ # Main pipeline
353
+ # ──────────────────────────────────────────────
354
def generate_audiobook(
    text_input: str,
    file_input,
    target_language: str,
    voice_label: str,
    add_pauses: bool,
    progress=gr.Progress(),
):
    """Main audiobook generation pipeline.

    Resolves the text source (an uploaded file wins over the textbox),
    splits it into chunks, narrates each chunk via the API, stitches the
    audio together, and transcodes the result to MP3.

    Args:
        text_input: Raw text pasted into the textbox (fallback source).
        file_input: Optional path of an uploaded text file (takes priority).
        target_language: A LANGUAGES key; "English" disables translation.
        voice_label: Dropdown label in "Name — Description" form.
        add_pauses: Insert 1.5 s of silence between chunks.
        progress: Gradio progress reporter (injected by Gradio).

    Returns:
        (mp3_path, stats_markdown, transcript_text)

    Raises:
        gr.Error: For user-facing failures (missing input/API key, etc.).
    """

    # ── Resolve text source ──
    if file_input is not None:
        try:
            with open(file_input, "r", encoding="utf-8", errors="replace") as f:
                text = f.read()
        except Exception as e:
            raise gr.Error(f"Failed to read file: {e}")
    elif text_input and text_input.strip():
        text = text_input.strip()
    else:
        raise gr.Error("Please provide text or upload a file.")

    if len(text) < 10:
        raise gr.Error("Text is too short. Please provide more content.")

    # ── API key ──
    api_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not api_key:
        raise gr.Error(
            "DASHSCOPE_API_KEY not set. Add it as a Space Secret "
            "(Settings → Secrets → New Secret)."
        )

    voice = get_voice_name(voice_label)
    lang_config = LANGUAGES[target_language]
    translate = target_language != "English"
    client = OpenAI(api_key=api_key, base_url=BASE_URL)
    tmp_dir = tempfile.mkdtemp(prefix="audiobook_")

    try:
        # ── Split text ──
        progress(0.05, desc="Splitting text into chunks...")
        chunks = split_text_into_chunks(text)
        total_chunks = len(chunks)
        total_chars = sum(len(c) for c in chunks)

        progress(0.08, desc=f"Processing {total_chunks} chunks ({total_chars:,} characters)...")

        # ── Generate speech for each chunk ──
        audio_files = []
        all_transcripts = []
        silence_path = os.path.join(tmp_dir, "silence.wav")
        if add_pauses:
            generate_silence(1.5, silence_path)

        for i, chunk in enumerate(chunks):
            # Chunk narration spans the 10%–90% range of the progress bar.
            frac = 0.1 + 0.8 * (i / total_chunks)
            progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")

            wav_path, transcript = generate_speech_chunk(
                client, chunk, voice, target_language,
                lang_config, translate, i, tmp_dir,
            )

            if wav_path:
                audio_files.append(wav_path)
                # Bug fix: record the transcript only on success. Previously
                # a trailing `if transcript and not transcript.startswith("⚠️")`
                # also appended a failed chunk's error string a second time
                # (once with the "⚠️ Chunk N failed" prefix, once bare).
                if transcript:
                    all_transcripts.append(transcript)
                # Add pause between chunks
                if add_pauses and i < total_chunks - 1:
                    audio_files.append(silence_path)
            else:
                all_transcripts.append(f"⚠️ Chunk {i+1} failed: {transcript}")
                # Insert silence placeholder for failed chunk
                fail_silence = os.path.join(tmp_dir, f"fail_silence_{i:04d}.wav")
                generate_silence(2.0, fail_silence)
                audio_files.append(fail_silence)

        if not audio_files:
            raise gr.Error("No audio was generated. Check your API key and try again.")

        # ── Concatenate all audio ──
        progress(0.92, desc="Assembling audiobook...")
        final_audio = os.path.join(tmp_dir, "audiobook.wav")
        concatenate_wavs(audio_files, final_audio)

        # ── Convert to MP3 for smaller file size ──
        progress(0.96, desc="Converting to MP3...")
        final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
        subprocess.run(
            ["ffmpeg", "-y", "-i", final_audio,
             "-codec:a", "libmp3lame", "-b:a", "128k",
             "-ar", "24000", "-ac", "1", final_mp3],
            capture_output=True, check=True,
        )

        progress(1.0, desc="Done!")

        # Build transcript display
        transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""

        # Stats
        audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
        stats = (
            f"**Audiobook Generated!**\n\n"
            f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
            f"- **Language:** {target_language} ({lang_config['native']})\n"
            f"- **Voice:** {voice_label}\n"
            f"- **File size:** {audio_size:.1f} MB\n"
            f"- **Quality tier:** {lang_config['tier'].title()}\n"
        )
        if lang_config["tier"] == "extended":
            stats += "\n> ⚠️ This is an extended language. Voice quality may vary compared to the core 10 languages."

        return final_mp3, stats, transcript_text

    except gr.Error:
        raise
    except Exception as e:
        # Wrap unexpected failures in a user-facing Gradio error.
        raise gr.Error(f"Pipeline error: {str(e)}")
    finally:
        # Don't clean up tmp_dir yet — Gradio needs the files
        pass
478
+
479
+
480
+ # ──────────────────────────────────────────────
481
+ # Build language choices with tier labels
482
+ # ──────────────────────────────────────────────
483
def get_language_choices():
    """Return dropdown labels: core languages starred, extended prefixed.

    NOTE(review): this helper appears unused — the UI builds its own
    `lang_choices` list at module level. The whitespace prefix on extended
    names pairs with the prefix stripping in clean_language_name(); confirm
    the intended prefix width before relying on it.
    """
    core = [f"⭐ {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "core"]
    extended = [f" {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "extended"]
    return core + extended
487
+
488
+
489
def clean_language_name(choice: str) -> str:
    """Strip the tier decoration ('⭐ ' / leading spaces) from a dropdown choice.

    Bug fix: the previous implementation chained blanket `replace` calls
    that removed interior spaces too, mangling multi-word names such as
    "Chinese (Mandarin)" into "Chinese(Mandarin)" — a key that does not
    exist in LANGUAGES. Only leading decoration is removed now; interior
    spaces are preserved so the result is a valid LANGUAGES key.
    """
    return choice.removeprefix("⭐").strip()
492
+
493
+
494
def generate_wrapper(text_input, file_input, language_choice, voice, add_pauses, progress=gr.Progress()):
    """Adapter between the Gradio click handler and generate_audiobook().

    Translates the decorated dropdown label (e.g. "⭐ English") back into a
    plain LANGUAGES key before delegating. The gr.Progress() default is the
    conventional Gradio pattern for progress injection, not an ordinary
    default argument.
    """
    language = clean_language_name(language_choice)
    return generate_audiobook(text_input, file_input, language, voice, add_pauses, progress)
497
+
498
+
499
+ # ──────────────────────────────────────────────
500
+ # Sample text
501
+ # ──────────────────────────────────────────────
502
# Built-in demo passage loaded by the "📄 Load Sample Text" button; sized to
# exercise multi-paragraph chunking and dialogue narration.
SAMPLE_TEXT = """Chapter 1: The Beginning

The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin tossed by the gods.

"One day," she whispered to the seagulls that perched on the railing, "I'll follow that sun to wherever it goes."

The gulls, as always, said nothing. They merely tilted their heads and regarded her with ancient, knowing eyes before launching themselves into the wind.

Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather — grey in storms, green in sunlight, and something altogether different in the strange purple twilight that sometimes settled over the coast in autumn.

The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside and the old building trembled like a living thing.

"Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.

And he would smile — that slow, careful smile that seemed to cost him something each time — and begin."""
517
+
518
+
519
+ # ──────────────────────────────────────────────
520
+ # Gradio UI
521
+ # ──────────────────────────────────────────────
522
# Markdown banner rendered at the top of the Gradio page.
DESCRIPTION = """
# 📖 Audiobook Generator
### English Text → Multi-Language Audiobook
**Powered by Qwen3.5-Omni-Plus**

Paste or upload English text and get a professionally narrated audiobook in any of **36 languages**.
The AI translates and narrates with expressive, audiobook-quality speech.

⭐ = Core language (best quality) · Others = Extended support
"""
532
+
533
# Language dropdown choices
# Section-header entries ("── ... ──") are appended here but filtered out
# again where the dropdown is built below, so only real language names
# (core starred, extended plain) reach the UI.
lang_choices = []
lang_choices.append("── Core Languages (Best Quality) ──")
for name, cfg in LANGUAGES.items():
    if cfg["tier"] == "core":
        lang_choices.append(f"⭐ {name}")
lang_choices.append("── Extended Languages ──")
for name, cfg in LANGUAGES.items():
    if cfg["tier"] == "extended":
        lang_choices.append(name)
543
+
544
# Top-level Gradio layout: inputs on the left, generated audio + stats on
# the right, with event wiring and a footer at the bottom.
with gr.Blocks(
    title="Audiobook Generator — Qwen3.5-Omni",
    theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="purple",
        neutral_hue="slate",
    ),
) as demo:

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # ── Left column: Input ──
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="English Text",
                placeholder="Paste your English text here...",
                lines=12,
                max_lines=30,
            )

            file_input = gr.File(
                label="Or Upload a Text File (.txt, .md)",
                file_types=[".txt", ".md", ".text"],
                type="filepath",
            )

            sample_btn = gr.Button("📄 Load Sample Text", variant="secondary", size="sm")

            with gr.Row():
                target_lang = gr.Dropdown(
                    # Drop the "──" section-header entries; only real
                    # language names become selectable values.
                    choices=[c for c in lang_choices if not c.startswith("──")],
                    value="⭐ English",
                    label="Target Language",
                    info="⭐ = Core (best quality). Choose English for no translation.",
                )

            voice_select = gr.Dropdown(
                choices=ALL_VOICES,
                value="Jennifer — Cinematic narrator",
                label="Narrator Voice",
            )

            add_pauses = gr.Checkbox(
                value=True,
                label="Add pauses between sections",
                info="Adds 1.5s silence between text chunks for natural pacing",
            )

            generate_btn = gr.Button(
                "🎙️ Generate Audiobook",
                variant="primary",
                size="lg",
            )

        # ── Right column: Output ──
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audiobook",
                type="filepath",
            )

            stats_output = gr.Markdown(label="Generation Stats")

            with gr.Accordion("Translation / Narration Transcript", open=False):
                transcript_output = gr.Markdown()

    # ── Event handlers ──
    sample_btn.click(
        fn=lambda: SAMPLE_TEXT,
        outputs=text_input,
    )

    generate_btn.click(
        fn=generate_wrapper,
        inputs=[text_input, file_input, target_lang, voice_select, add_pauses],
        outputs=[audio_output, stats_output, transcript_output],
    )

    # ── Footer ──
    gr.Markdown(
        "---\n"
        "**How it works:** Your text is split into chunks, each sent to Qwen3.5-Omni-Plus "
        "for translation (if needed) + speech synthesis, then assembled into a single MP3 audiobook.\n\n"
        "**Supported languages (36):** Arabic, Bengali, Cantonese, Chinese, Czech, Danish, Dutch, "
        "English, Filipino, Finnish, French, German, Greek, Hebrew, Hindi, Hungarian, Indonesian, "
        "Italian, Japanese, Korean, Malay, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, "
        "Spanish, Swahili, Swedish, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese\n\n"
        "Built with [Gradio](https://gradio.app) · Model by [Alibaba Qwen](https://qwen.ai) · "
        "API via [DashScope](https://www.alibabacloud.com/help/en/model-studio/)"
    )

# Standard script entry point so importing this module has no side effects
# beyond building `demo`.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ openai>=1.52.0
2
+ gradio>=5.0.0