PlotweaverModel commited on
Commit
0a5456e
·
verified ·
1 Parent(s): e2d9fde

Upload 2 files

Browse files
Files changed (1) hide show
  1. app.py +475 -456
app.py CHANGED
@@ -1,22 +1,19 @@
1
  """
2
  Audiobook Generator - English Source to Multi-Language Audio
3
- Powered by Qwen3.5-Omni-Plus + Qwen3-TTS-VC via DashScope API
4
-
5
- Three voice modes:
6
- 1. Preset Voices: Use built-in Qwen voices (via Qwen3.5-Omni-Plus)
7
- 2. Cloned Voice: Clone a voice from audio sample (via Qwen3-TTS-VC)
8
- 3. Both support translation from English to 36 languages
9
 
10
  Deploy as a Hugging Face Space:
11
  1. Create a new Space (SDK: Gradio)
12
  2. Upload app.py and requirements.txt
13
- 3. Add DASHSCOPE_API_KEY as a Space Secret
14
  """
15
 
16
  import os
17
  import base64
18
  import json
19
- import math
20
  import pathlib
21
  import shutil
22
  import struct
@@ -29,7 +26,6 @@ import gradio as gr
29
  import requests as http_requests
30
  from openai import OpenAI
31
 
32
- # Optional document parsers
33
  try:
34
  import pypdf
35
  HAS_PYPDF = True
@@ -42,57 +38,126 @@ try:
42
  except ImportError:
43
  HAS_DOCX = False
44
 
45
- # Configuration
 
 
46
  OMNI_MODEL = "qwen3.5-omni-plus"
47
  TTS_VC_MODEL = "qwen3-tts-vc-2026-01-22"
48
  VOICE_CLONE_MODEL = "qwen-voice-enrollment"
49
 
50
- BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
51
  DASHSCOPE_API_URL = "https://dashscope-intl.aliyuncs.com/api/v1"
52
  VOICE_CLONE_URL = f"{DASHSCOPE_API_URL}/services/audio/tts/customization"
53
  TTS_SYNTHESIS_URL = f"{DASHSCOPE_API_URL}/services/aigc/multimodal-generation/generation"
54
 
 
 
 
 
55
  MAX_CHARS_PER_CHUNK = 1500
56
 
57
- # Languages
 
 
 
58
  LANGUAGES = {
59
- "English": {"code": "en", "native": "English", "tier": "core"},
60
- "Chinese (Mandarin)": {"code": "zh", "native": "Chinese", "tier": "core"},
61
- "Japanese": {"code": "ja", "native": "Japanese", "tier": "core"},
62
- "Korean": {"code": "ko", "native": "Korean", "tier": "core"},
63
- "German": {"code": "de", "native": "Deutsch", "tier": "core"},
64
- "French": {"code": "fr", "native": "Francais", "tier": "core"},
65
- "Russian": {"code": "ru", "native": "Russian", "tier": "core"},
66
- "Portuguese": {"code": "pt", "native": "Portugues", "tier": "core"},
67
- "Spanish": {"code": "es", "native": "Espanol", "tier": "core"},
68
- "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
69
- "Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
70
- "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
71
- "Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
72
- "Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
73
- "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  }
75
 
 
 
 
 
76
  VOICE_CLONE_LANGUAGES = {
77
  "English", "Chinese (Mandarin)", "Japanese", "Korean", "German",
78
  "French", "Russian", "Portuguese", "Spanish", "Italian",
79
  }
80
 
 
 
 
81
  PRESET_VOICES = [
82
- "Cherry -- Sunny, friendly",
83
- "Jennifer -- Cinematic narrator",
84
- "Katerina -- Mature, rich rhythm",
85
- "Ethan -- Warm, energetic",
86
- "Ryan -- Dramatic, rhythmic",
87
- "Kai -- Soothing, calm",
88
- "Aiden -- Young, lively",
89
- "Eldric Sage -- Authoritative narrator",
90
- "Arthur -- Classic, mature",
91
- "Bella -- Elegant, warm",
92
- "Vivian -- Professional, clear",
93
- "Seren -- Calm, measured",
94
- "Dolce -- Sweet, melodic",
95
- "Vincent -- Rich, theatrical",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  ]
97
 
98
 
@@ -100,12 +165,16 @@ def get_voice_name(label):
100
  return label.split("--")[0].strip()
101
 
102
 
103
- # Audio helpers
 
 
 
 
 
 
104
  def base64_to_wav(b64_data, output_path):
105
  audio_bytes = base64.b64decode(b64_data)
106
- sr = 24000
107
- nc = 1
108
- bps = 16
109
  br = sr * nc * bps // 8
110
  ba = nc * bps // 8
111
  ds = len(audio_bytes)
@@ -146,494 +215,423 @@ def concatenate_wavs(wav_files, output_path):
146
 
147
  def generate_silence(duration_sec, output_path):
148
  subprocess.run(
149
- ["ffmpeg", "-y", "-f", "lavfi",
150
- "-i", "anullsrc=r=24000:cl=mono",
151
  "-t", str(duration_sec), "-acodec", "pcm_s16le", output_path],
152
  capture_output=True, check=True,
153
  )
154
 
155
 
156
- # Document extraction
157
- def extract_text_from_pdf(filepath):
158
- if not HAS_PYPDF:
159
- raise ImportError("pypdf is not installed.")
160
- reader = pypdf.PdfReader(filepath)
161
- pages = []
162
- for page in reader.pages:
163
- text = page.extract_text()
164
- if text:
165
- pages.append(text.strip())
166
- return "\n\n".join(pages)
167
-
168
-
169
- def extract_text_from_docx(filepath):
170
- if not HAS_DOCX:
171
- raise ImportError("python-docx is not installed.")
172
- doc = docx.Document(filepath)
173
- paragraphs = []
174
- for para in doc.paragraphs:
175
- text = para.text.strip()
176
- if text:
177
- paragraphs.append(text)
178
- return "\n\n".join(paragraphs)
179
-
180
-
181
  def extract_text_from_file(filepath):
182
  ext = os.path.splitext(filepath)[1].lower()
183
  if ext == ".pdf":
184
- return extract_text_from_pdf(filepath)
 
 
 
185
  elif ext in (".docx", ".doc"):
186
  if ext == ".doc":
187
- try:
188
- tmp_dir = tempfile.mkdtemp()
189
- subprocess.run(
190
- ["libreoffice", "--headless", "--convert-to", "docx",
191
- "--outdir", tmp_dir, filepath],
192
- capture_output=True, check=True, timeout=60,
193
- )
194
- docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
195
- docx_path = os.path.join(tmp_dir, docx_name)
196
- if os.path.exists(docx_path):
197
- text = extract_text_from_docx(docx_path)
198
- shutil.rmtree(tmp_dir, ignore_errors=True)
199
- return text
200
- except Exception:
201
- pass
202
- raise gr.Error("Cannot read .doc files. Please save as .docx or .pdf.")
203
- return extract_text_from_docx(filepath)
204
  else:
205
  with open(filepath, "r", encoding="utf-8", errors="replace") as f:
206
  return f.read()
207
 
208
 
209
- # Text splitting
 
 
210
  def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK):
211
  text = text.strip()
212
  if not text:
213
  return []
214
  if len(text) <= max_chars:
215
  return [text]
216
-
217
- chunks = []
218
- paragraphs = re.split(r"\n\s*\n", text)
219
- current_chunk = ""
220
-
221
  for para in paragraphs:
222
  para = para.strip()
223
  if not para:
224
  continue
225
- if len(current_chunk) + len(para) + 2 <= max_chars:
226
- current_chunk = (current_chunk + "\n\n" + para).strip()
227
  else:
228
- if current_chunk:
229
- chunks.append(current_chunk)
230
- current_chunk = ""
231
  if len(para) > max_chars:
232
  sentences = re.split(r"(?<=[.!?])\s+", para)
233
- for sentence in sentences:
234
- if len(current_chunk) + len(sentence) + 1 <= max_chars:
235
- current_chunk = (current_chunk + " " + sentence).strip()
 
236
  else:
237
- if current_chunk:
238
- chunks.append(current_chunk)
239
- if len(sentence) > max_chars:
240
- words = sentence.split()
241
- current_chunk = ""
242
- for word in words:
243
- if len(current_chunk) + len(word) + 1 <= max_chars:
244
- current_chunk = (current_chunk + " " + word).strip()
245
- else:
246
- if current_chunk:
247
- chunks.append(current_chunk)
248
- current_chunk = word
249
- else:
250
- current_chunk = sentence
251
  else:
252
- current_chunk = para
253
-
254
- if current_chunk:
255
- chunks.append(current_chunk)
256
  return chunks
257
 
258
 
259
- # ==============================
260
- # VOICE CLONING
261
- # ==============================
262
  def prepare_clone_audio(audio_path):
263
- """
264
- Prepare audio for voice cloning:
265
- - Accept 10s to 3min input
266
- - Trim to best 60s (API max) from the middle for voice consistency
267
- - Convert to mono WAV at 24kHz for best quality
268
- Returns path to the prepared file.
269
- """
270
- # Get duration
271
  result = subprocess.run(
272
  ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
273
  "-of", "default=noprint_wrappers=1:nokey=1", audio_path],
274
  capture_output=True, text=True,
275
  )
276
  duration = float(result.stdout.strip())
277
-
278
  if duration < 10:
279
- raise ValueError(
280
- f"Audio is too short ({duration:.1f}s). "
281
- f"Please provide at least 10 seconds of clear speech."
282
- )
283
-
284
- # If under 60s, just convert format; if over 60s, take the best 60s
285
- tmp_prepared = audio_path + "_prepared.wav"
286
-
287
  if duration <= 60:
288
- # Convert to proper format (mono, 24kHz, 16-bit WAV)
289
- subprocess.run(
290
- ["ffmpeg", "-y", "-i", audio_path,
291
- "-ar", "24000", "-ac", "1", "-acodec", "pcm_s16le",
292
- tmp_prepared],
293
- capture_output=True, check=True,
294
- )
295
  else:
296
- # Take 60s from 5s into the audio (skip intro silence/noise)
297
  start = min(5, duration - 60)
298
- subprocess.run(
299
- ["ffmpeg", "-y", "-ss", str(start), "-t", "60",
300
- "-i", audio_path,
301
- "-ar", "24000", "-ac", "1", "-acodec", "pcm_s16le",
302
- tmp_prepared],
303
- capture_output=True, check=True,
304
- )
305
-
306
- return tmp_prepared
307
 
308
 
309
- def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
310
- # Prepare audio (trim if needed, convert format)
311
- prepared_path = prepare_clone_audio(audio_path)
312
-
313
- filepath = pathlib.Path(prepared_path)
314
- if not filepath.exists():
315
- raise FileNotFoundError(f"Prepared audio file not found: {prepared_path}")
316
-
317
- b64_str = base64.b64encode(filepath.read_bytes()).decode()
318
- data_uri = f"data:audio/wav;base64,{b64_str}"
319
-
320
- # Clean up prepared file
321
  try:
322
- os.remove(prepared_path)
323
  except OSError:
324
  pass
325
-
326
- payload = {
327
  "model": VOICE_CLONE_MODEL,
328
  "input": {
329
- "action": "create",
330
- "target_model": TTS_VC_MODEL,
331
- "preferred_name": preferred_name,
332
- "audio": {"data": data_uri},
333
  },
334
- }
335
- headers = {
336
- "Authorization": f"Bearer {api_key}",
337
- "Content-Type": "application/json",
338
- }
339
-
340
- resp = http_requests.post(VOICE_CLONE_URL, json=payload, headers=headers, timeout=60)
341
  if resp.status_code != 200:
342
- raise RuntimeError(f"Voice cloning failed ({resp.status_code}): {resp.text}")
343
-
344
- try:
345
- return resp.json()["output"]["voice"]
346
- except (KeyError, ValueError) as e:
347
- raise RuntimeError(f"Failed to parse voice clone response: {e}\n{resp.text}")
348
-
349
-
350
- # ==============================
351
- # TTS WITH CLONED VOICE
352
- # ==============================
353
- def synthesize_with_cloned_voice(text, voice_id, language, api_key, output_dir, chunk_index):
354
- lang_type_map = {
355
- "English": "English", "Chinese (Mandarin)": "Chinese",
356
- "Japanese": "Japanese", "Korean": "Korean",
357
- "German": "German", "French": "French",
358
- "Russian": "Russian", "Portuguese": "Portuguese",
359
- "Spanish": "Spanish", "Italian": "Italian",
360
- }
361
- language_type = lang_type_map.get(language, "English")
362
-
363
- payload = {
364
- "model": TTS_VC_MODEL,
365
- "input": {
366
- "text": text,
367
- "voice": voice_id,
368
- "language_type": language_type,
369
- },
370
- }
371
- headers = {
372
- "Authorization": f"Bearer {api_key}",
373
- "Content-Type": "application/json",
374
- }
375
 
376
- try:
377
- resp = http_requests.post(TTS_SYNTHESIS_URL, json=payload, headers=headers, timeout=120)
378
- if resp.status_code != 200:
379
- return None, f"TTS failed ({resp.status_code}): {resp.text[:200]}"
380
-
381
- result = resp.json()
382
- audio_url = result.get("output", {}).get("audio", {}).get("url")
383
- if not audio_url:
384
- return None, f"No audio URL in response: {json.dumps(result)[:200]}"
385
-
386
- output_wav = os.path.join(output_dir, f"vc_chunk_{chunk_index:04d}.wav")
387
- audio_resp = http_requests.get(audio_url, timeout=120)
388
- if audio_resp.status_code != 200:
389
- return None, "Failed to download audio from URL"
390
-
391
- with open(output_wav, "wb") as f:
392
- f.write(audio_resp.content)
393
-
394
- return output_wav, None
395
 
396
- except Exception as e:
397
- return None, str(e)
398
-
399
-
400
- # ==============================
401
- # TRANSLATION (text only)
402
- # ==============================
403
  def translate_text(client, text, target_language, lang_config):
404
  response = client.chat.completions.create(
405
- model=OMNI_MODEL,
406
- modalities=["text"],
407
  messages=[
408
- {
409
- "role": "system",
410
- "content": (
411
- f"You are a professional translator. Translate English text into "
412
- f"natural, fluent {target_language} ({lang_config['native']}). "
413
- f"Output ONLY the translated text."
414
- ),
415
- },
416
- {
417
- "role": "user",
418
- "content": f"Translate the following into {target_language}:\n\n{text}",
419
- },
420
  ],
421
  )
422
  return response.choices[0].message.content.strip()
423
 
424
 
425
- # ==============================
426
- # SPEECH WITH PRESET VOICE
427
- # ==============================
428
  def generate_speech_preset(client, text, voice, language, lang_config, translate, chunk_index, output_dir):
429
  output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
430
-
431
  if translate and language != "English":
432
- system_prompt = (
433
- f"You are a professional audiobook narrator and translator.\n"
434
- f"Translate the English text into natural {language} ({lang_config['native']}).\n"
435
- f"Read the translation aloud with expressive audiobook narration.\n"
436
- f"Respond ONLY with the spoken {language} narration."
437
- )
438
- user_text = f"Translate into {language} and narrate as an audiobook:\n\n{text}"
439
  else:
440
- system_prompt = (
441
- "You are a professional audiobook narrator.\n"
442
- "Read the text with clear, expressive narration.\n"
443
- "Respond ONLY with the spoken narration."
444
- )
445
- user_text = f"Narrate as an audiobook:\n\n{text}"
446
-
447
  try:
448
  completion = client.chat.completions.create(
449
  model=OMNI_MODEL,
450
- messages=[
451
- {"role": "system", "content": system_prompt},
452
- {"role": "user", "content": user_text},
453
- ],
454
- modalities=["text", "audio"],
455
- audio={"voice": voice, "format": "wav"},
456
- stream=True,
457
- stream_options={"include_usage": True},
458
  )
459
-
460
- audio_chunks = []
461
- transcript_parts = []
462
-
463
  for event in completion:
464
  if not event.choices:
465
  continue
466
  delta = event.choices[0].delta
467
  if hasattr(delta, "content") and delta.content:
468
- transcript_parts.append(delta.content)
469
  if hasattr(delta, "audio") and delta.audio:
470
- if isinstance(delta.audio, dict):
471
- if "data" in delta.audio:
472
- audio_chunks.append(delta.audio["data"])
473
  elif hasattr(delta.audio, "data") and delta.audio.data:
474
- audio_chunks.append(delta.audio.data)
475
-
476
- transcript = "".join(transcript_parts)
477
-
478
- if audio_chunks:
479
- full_audio_b64 = "".join(audio_chunks)
480
- base64_to_wav(full_audio_b64, output_wav)
481
  return output_wav, transcript
482
  return None, "No audio received"
483
-
484
  except Exception as e:
485
  return None, str(e)
486
 
487
 
488
- # ==============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  # MAIN PIPELINE
490
- # ==============================
491
  def generate_audiobook(text_input, file_input, target_language, voice_mode,
492
- preset_voice_label, clone_audio, add_pauses, progress=gr.Progress()):
 
 
493
  # Resolve text
494
  if file_input is not None:
495
- try:
496
- progress(0.02, desc="Extracting text from document...")
497
- text = extract_text_from_file(file_input)
498
- except gr.Error:
499
- raise
500
- except Exception as e:
501
- raise gr.Error(f"Failed to read file: {e}")
502
  elif text_input and text_input.strip():
503
  text = text_input.strip()
504
  else:
505
  raise gr.Error("Please provide text or upload a file.")
506
-
507
  if len(text) < 10:
508
  raise gr.Error("Text is too short.")
509
 
510
- api_key = os.environ.get("DASHSCOPE_API_KEY", "")
511
- if not api_key:
512
- raise gr.Error("DASHSCOPE_API_KEY not set. Add it in Settings > Secrets.")
513
 
514
  lang_config = LANGUAGES[target_language]
 
515
  use_clone = voice_mode == "Clone a Voice"
 
516
  translate = target_language != "English"
517
- client = OpenAI(api_key=api_key, base_url=BASE_URL)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
  tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
519
 
520
  # Voice cloning setup
521
  cloned_voice_id = None
522
  if use_clone:
523
  if clone_audio is None:
524
- raise gr.Error("Please upload a voice sample (10-60 seconds of clear speech).")
525
-
526
  if target_language not in VOICE_CLONE_LANGUAGES:
527
- raise gr.Error(
528
- f"Voice cloning TTS supports: {', '.join(sorted(VOICE_CLONE_LANGUAGES))}. "
529
- f"'{target_language}' is not supported with cloned voices. Use a preset voice instead."
530
- )
531
-
532
- progress(0.03, desc="Cloning voice from audio sample...")
533
- try:
534
- cloned_voice_id = clone_voice(clone_audio, api_key)
535
- progress(0.08, desc="Voice cloned successfully!")
536
- except Exception as e:
537
- raise gr.Error(f"Voice cloning failed: {e}")
538
 
539
  try:
540
- # Split text
541
- progress(0.10, desc="Splitting text into chunks...")
542
  chunks = split_text_into_chunks(text)
543
  total_chunks = len(chunks)
544
  total_chars = sum(len(c) for c in chunks)
545
 
546
- # Process each chunk
547
- audio_files = []
548
- all_transcripts = []
549
  silence_path = os.path.join(tmp_dir, "silence.wav")
550
  if add_pauses:
551
  generate_silence(1.5, silence_path)
552
 
553
  for i, chunk in enumerate(chunks):
554
- frac = 0.12 + 0.75 * (i / total_chunks)
555
  progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")
556
 
557
- if use_clone:
558
- # CLONED VOICE PIPELINE
559
- final_text = chunk
560
- if translate:
561
- try:
562
- final_text = translate_text(client, chunk, target_language, lang_config)
563
- all_transcripts.append(final_text)
564
- except Exception as e:
565
- all_transcripts.append(f"Translation failed for chunk {i+1}: {e}")
566
- final_text = chunk
567
-
568
- wav_path, error = synthesize_with_cloned_voice(
569
- final_text, cloned_voice_id, target_language, api_key, tmp_dir, i,
570
- )
571
 
572
- if wav_path:
573
- audio_files.append(wav_path)
574
- else:
575
- all_transcripts.append(f"TTS failed for chunk {i+1}: {error}")
576
- fail_silence = os.path.join(tmp_dir, f"fail_{i:04d}.wav")
577
- generate_silence(2.0, fail_silence)
578
- audio_files.append(fail_silence)
 
 
 
 
 
 
579
  else:
580
- # PRESET VOICE PIPELINE
581
  voice = get_voice_name(preset_voice_label)
582
  wav_path, transcript = generate_speech_preset(
583
  client, chunk, voice, target_language,
584
  lang_config, translate, i, tmp_dir,
585
  )
 
586
 
587
- if wav_path:
588
- audio_files.append(wav_path)
589
- else:
590
- all_transcripts.append(f"Chunk {i+1} failed: {transcript}")
591
- fail_silence = os.path.join(tmp_dir, f"fail_{i:04d}.wav")
592
- generate_silence(2.0, fail_silence)
593
- audio_files.append(fail_silence)
594
 
595
- if transcript and "failed" not in transcript.lower():
596
- all_transcripts.append(transcript)
597
 
598
- # Pause between chunks
599
  if add_pauses and i < total_chunks - 1 and audio_files:
600
  audio_files.append(silence_path)
601
 
602
  if not audio_files:
603
  raise gr.Error("No audio was generated.")
604
 
605
- # Concatenate
606
  progress(0.90, desc="Assembling audiobook...")
607
  final_audio = os.path.join(tmp_dir, "audiobook.wav")
608
  concatenate_wavs(audio_files, final_audio)
609
 
610
- # Convert to MP3
611
  progress(0.95, desc="Converting to MP3...")
612
  final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
613
  subprocess.run(
614
- ["ffmpeg", "-y", "-i", final_audio,
615
- "-codec:a", "libmp3lame", "-b:a", "128k",
616
- "-ar", "24000", "-ac", "1", final_mp3],
617
  capture_output=True, check=True,
618
  )
619
 
620
  progress(1.0, desc="Done!")
621
 
622
  audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
623
- voice_info = f"Cloned voice (ID: {cloned_voice_id[:20]}...)" if use_clone else preset_voice_label
 
 
 
 
 
 
 
 
 
624
  stats = (
625
  f"**Audiobook Generated!**\n\n"
626
  f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
627
  f"- **Language:** {target_language} ({lang_config['native']})\n"
628
  f"- **Voice:** {voice_info}\n"
629
- f"- **Mode:** {'Voice Clone via Qwen3-TTS-VC' if use_clone else 'Preset via Qwen3.5-Omni-Plus'}\n"
630
  f"- **File size:** {audio_size:.1f} MB\n"
631
  )
632
- if lang_config["tier"] == "extended" and not use_clone:
633
- stats += "\n> Note: Extended language - voice quality may vary."
634
 
635
  transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""
636
-
637
  return final_mp3, stats, transcript_text
638
 
639
  except gr.Error:
@@ -642,20 +640,18 @@ def generate_audiobook(text_input, file_input, target_language, voice_mode,
642
  raise gr.Error(f"Pipeline error: {str(e)}")
643
 
644
 
645
- # ==============================
646
  # GRADIO UI
647
- # ==============================
648
  SAMPLE_TEXT = """Chapter 1: The Beginning
649
 
650
- The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin tossed by the gods.
651
-
652
- "One day," she whispered to the seagulls that perched on the railing, "I'll follow that sun to wherever it goes."
653
 
654
- The gulls, as always, said nothing. They merely tilted their heads and regarded her with ancient, knowing eyes before launching themselves into the wind.
655
 
656
- Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather - grey in storms, green in sunlight, and something altogether different in the strange purple twilight that sometimes settled over the coast in autumn.
657
 
658
- The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside and the old building trembled like a living thing.
659
 
660
  "Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.
661
 
@@ -663,111 +659,127 @@ And he would smile - that slow, careful smile that seemed to cost him something
663
 
664
  DESCRIPTION = """
665
  # Audiobook Generator
666
- ### English Text to Multi-Language Audiobook with Voice Cloning
 
667
 
668
- Upload English text and generate a narrated audiobook in **selected languages**.
669
- Choose a **preset voice** or **clone any voice** from a short audio sample!
 
 
 
670
 
 
671
  """
672
 
 
673
  lang_choices = []
674
- for name, cfg in LANGUAGES.items():
675
- if cfg["tier"] == "core":
676
- lang_choices.append(f"* {name}")
677
- for name, cfg in LANGUAGES.items():
678
- if cfg["tier"] == "extended":
679
- lang_choices.append(name)
 
 
 
 
 
 
 
 
 
 
 
 
680
 
681
 
682
  def clean_language_name(choice):
683
- return choice.replace("* ", "").strip()
 
 
 
 
 
 
 
684
 
685
 
686
  def on_voice_mode_change(mode):
687
  if mode == "Clone a Voice":
688
- return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
  else:
690
- return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
691
 
692
 
693
  def generate_wrapper(text_input, file_input, language_choice, voice_mode,
694
- preset_voice, clone_audio, add_pauses, progress=gr.Progress()):
 
695
  language = clean_language_name(language_choice)
696
  return generate_audiobook(
697
  text_input, file_input, language, voice_mode,
698
- preset_voice, clone_audio, add_pauses, progress,
 
699
  )
700
 
701
 
702
- with gr.Blocks(
703
- title="Audiobook Generator",
704
- theme=gr.themes.Soft(
705
- primary_hue="indigo",
706
- secondary_hue="purple",
707
- neutral_hue="slate",
708
- ),
709
- ) as demo:
710
 
711
  gr.Markdown(DESCRIPTION)
712
 
713
  with gr.Row():
714
  with gr.Column(scale=1):
715
- text_input = gr.Textbox(
716
- label="English Text",
717
- placeholder="Paste your English text here...",
718
- lines=10,
719
- max_lines=25,
720
- )
721
- file_input = gr.File(
722
- label="Or Upload a Document (.txt, .md, .pdf, .docx)",
723
- file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
724
- type="filepath",
725
- )
726
  sample_btn = gr.Button("Load Sample Text", variant="secondary", size="sm")
727
 
728
- target_lang = gr.Dropdown(
729
- choices=lang_choices,
730
- value="* English",
731
- label="Target Language",
732
- info="* = Core (best quality). Voice cloning supports core languages only.",
733
- )
734
 
735
  voice_mode = gr.Radio(
736
- choices=["Preset Voice", "Clone a Voice"],
737
- value="Preset Voice",
738
- label="Voice Mode",
739
- )
740
-
741
- preset_voice = gr.Dropdown(
742
- choices=PRESET_VOICES,
743
- value="Jennifer -- Cinematic narrator",
744
- label="Preset Narrator Voice",
745
- visible=True,
746
  )
747
 
748
- clone_audio = gr.Audio(
749
- label="Upload Voice Sample (10 seconds to 3 minutes, WAV/MP3/M4A)",
750
- type="filepath",
751
- visible=False,
752
- )
753
 
 
 
754
  clone_info = gr.Markdown(
755
- value=(
756
- "> **Voice cloning tips:**\n"
757
- "> - Use 10 seconds to 3 minutes of clear, single-speaker audio\n"
758
- "> - Longer samples give better voice quality (auto-trimmed to best 60s)\n"
759
- "> - No background music or noise\n"
760
- "> - WAV (16-bit), MP3, or M4A format\n"
761
- "> - Cloned voice TTS supports 10 core languages only"
762
- ),
763
  visible=False,
764
  )
765
 
766
- add_pauses = gr.Checkbox(
767
- value=True,
768
- label="Add pauses between sections",
769
- info="1.5s silence between chunks",
770
- )
 
 
 
 
 
 
771
 
772
  generate_btn = gr.Button("Generate Audiobook", variant="primary", size="lg")
773
 
@@ -779,22 +791,29 @@ with gr.Blocks(
779
 
780
  sample_btn.click(fn=lambda: SAMPLE_TEXT, outputs=text_input)
781
 
 
 
 
782
  voice_mode.change(
783
- fn=on_voice_mode_change,
784
- inputs=voice_mode,
785
- outputs=[preset_voice, clone_audio, clone_info],
786
  )
787
 
788
  generate_btn.click(
789
  fn=generate_wrapper,
790
  inputs=[text_input, file_input, target_lang, voice_mode,
791
- preset_voice, clone_audio, add_pauses],
792
  outputs=[audio_output, stats_output, transcript_output],
793
  )
794
 
795
  gr.Markdown(
796
  "---\n"
797
-
 
 
 
 
 
798
  )
799
 
800
  if __name__ == "__main__":
 
1
  """
2
  Audiobook Generator - English Source to Multi-Language Audio
3
+ Powered by:
4
+ - Qwen3.5-Omni-Plus (preset voices, 36 languages)
5
+ - Qwen3-TTS-VC (voice cloning, 10 languages)
6
+ - YourVoic API (1000+ emotional voices, 93+ languages including Arabic, Swahili, Indian languages)
 
 
7
 
8
  Deploy as a Hugging Face Space:
9
  1. Create a new Space (SDK: Gradio)
10
  2. Upload app.py and requirements.txt
11
+ 3. Add secrets: DASHSCOPE_API_KEY (required), YOURVOIC_API_KEY (optional)
12
  """
13
 
14
  import os
15
  import base64
16
  import json
 
17
  import pathlib
18
  import shutil
19
  import struct
 
26
  import requests as http_requests
27
  from openai import OpenAI
28
 
 
29
  try:
30
  import pypdf
31
  HAS_PYPDF = True
 
38
  except ImportError:
39
  HAS_DOCX = False
40
 
41
+ # ==========================================
42
+ # CONFIGURATION
43
+ # ==========================================
44
  OMNI_MODEL = "qwen3.5-omni-plus"
45
  TTS_VC_MODEL = "qwen3-tts-vc-2026-01-22"
46
  VOICE_CLONE_MODEL = "qwen-voice-enrollment"
47
 
48
+ DASHSCOPE_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
49
  DASHSCOPE_API_URL = "https://dashscope-intl.aliyuncs.com/api/v1"
50
  VOICE_CLONE_URL = f"{DASHSCOPE_API_URL}/services/audio/tts/customization"
51
  TTS_SYNTHESIS_URL = f"{DASHSCOPE_API_URL}/services/aigc/multimodal-generation/generation"
52
 
53
+ # YourVoic API
54
+ YOURVOIC_TTS_URL = "https://yourvoic.com/api/v1/tts/generate"
55
+ YOURVOIC_VOICES_URL = "https://yourvoic.com/api/v1/voices"
56
+
57
  MAX_CHARS_PER_CHUNK = 1500
58
 
59
+ # ==========================================
60
+ # LANGUAGES - split by engine
61
+ # ==========================================
62
+ # "engine": "qwen" = Qwen Preset + Clone, "yourvoic" = YourVoic only, "both" = available on both
63
  LANGUAGES = {
64
+ # -- Qwen Core (11 languages: 10 starred + Arabic) --
65
+ "English": {"code": "en", "engine": "qwen", "yourvoic": "en-US"},
66
+ "Chinese (Mandarin)": {"code": "zh", "engine": "qwen", "yourvoic": "zh-CN"},
67
+ "Japanese": {"code": "ja", "engine": "qwen", "yourvoic": "ja-JP"},
68
+ "Korean": {"code": "ko", "engine": "qwen", "yourvoic": "ko-KR"},
69
+ "German": {"code": "de", "engine": "qwen", "yourvoic": "de-DE"},
70
+ "French": {"code": "fr", "engine": "qwen", "yourvoic": "fr-FR"},
71
+ "Russian": {"code": "ru", "engine": "qwen", "yourvoic": "ru-RU"},
72
+ "Portuguese": {"code": "pt", "engine": "qwen", "yourvoic": "pt-BR"},
73
+ "Spanish": {"code": "es", "engine": "qwen", "yourvoic": "es-ES"},
74
+ "Italian": {"code": "it", "engine": "qwen", "yourvoic": "it-IT"},
75
+ "Arabic": {"code": "ar", "engine": "qwen", "yourvoic": "ar-SA"},
76
+ # -- YourVoic: African Languages --
77
+ "Afrikaans": {"code": "af", "engine": "yourvoic", "yourvoic": "af-ZA"},
78
+ "Amharic": {"code": "am", "engine": "yourvoic", "yourvoic": "am-ET"},
79
+ "Swahili": {"code": "sw", "engine": "yourvoic", "yourvoic": "sw-KE"},
80
+ # -- YourVoic: Core Indian Languages --
81
+ "Hindi": {"code": "hi", "engine": "yourvoic", "yourvoic": "hi-IN"},
82
+ "Marathi": {"code": "mr", "engine": "yourvoic", "yourvoic": "mr-IN"},
83
+ "Bengali": {"code": "bn", "engine": "yourvoic", "yourvoic": "bn-IN"},
84
+ "Telugu": {"code": "te", "engine": "yourvoic", "yourvoic": "te-IN"},
85
+ "Tamil": {"code": "ta", "engine": "yourvoic", "yourvoic": "ta-IN"},
86
+ "Gujarati": {"code": "gu", "engine": "yourvoic", "yourvoic": "gu-IN"},
87
+ "Kannada": {"code": "kn", "engine": "yourvoic", "yourvoic": "kn-IN"},
88
+ "Malayalam": {"code": "ml", "engine": "yourvoic", "yourvoic": "ml-IN"},
89
+ "Punjabi": {"code": "pa", "engine": "yourvoic", "yourvoic": "pa-IN"},
90
+ "Odia": {"code": "or", "engine": "yourvoic", "yourvoic": "or-IN"},
91
+ "Assamese": {"code": "as", "engine": "yourvoic", "yourvoic": "as-IN"},
92
+ "Maithili": {"code": "mai", "engine": "yourvoic", "yourvoic": "mai-IN"},
93
+ "Kashmiri": {"code": "ks", "engine": "yourvoic", "yourvoic": "ks-IN"},
94
+ "Sindhi": {"code": "sd", "engine": "yourvoic", "yourvoic": "sd-IN"},
95
+ "Konkani": {"code": "kok", "engine": "yourvoic", "yourvoic": "kok-IN"},
96
+ "Dogri": {"code": "doi", "engine": "yourvoic", "yourvoic": "doi-IN"},
97
+ "Manipuri": {"code": "mni", "engine": "yourvoic", "yourvoic": "mni-IN"},
98
+ "Bodo": {"code": "brx", "engine": "yourvoic", "yourvoic": "brx-IN"},
99
+ "Sanskrit": {"code": "sa", "engine": "yourvoic", "yourvoic": "sa-IN"},
100
+ # -- YourVoic: Related South Asian Languages --
101
+ "Urdu": {"code": "ur", "engine": "yourvoic", "yourvoic": "ur-PK"},
102
+ "Nepali": {"code": "ne", "engine": "yourvoic", "yourvoic": "ne-NP"},
103
+ "Sinhala": {"code": "si", "engine": "yourvoic", "yourvoic": "si-LK"},
104
  }
105
 
106
+ # Qwen languages (for preset + clone)
107
+ QWEN_LANGUAGES = {k for k, v in LANGUAGES.items() if v["engine"] == "qwen"}
108
+
109
+ # Voice cloning only supports the original 10 (not Arabic)
110
  VOICE_CLONE_LANGUAGES = {
111
  "English", "Chinese (Mandarin)", "Japanese", "Korean", "German",
112
  "French", "Russian", "Portuguese", "Spanish", "Italian",
113
  }
114
 
115
+ # YourVoic languages
116
+ YOURVOIC_LANGUAGES = {k for k, v in LANGUAGES.items() if v["engine"] == "yourvoic"}
117
+
118
  PRESET_VOICES = [
119
+ "Cherry -- Sunny, friendly", "Serena -- Gentle, soft",
120
+ "Jennifer -- Cinematic narrator", "Katerina -- Mature, rich rhythm",
121
+ "Ethan -- Warm, energetic", "Ryan -- Dramatic, rhythmic",
122
+ "Kai -- Soothing, calm", "Neil -- Precise, clear",
123
+ "Lenn -- Rational, steady", "Eldric Sage -- Authoritative narrator",
124
+ "Arthur -- Classic, mature", "Bella -- Elegant, warm",
125
+ "Vivian -- Professional, clear", "Seren -- Calm, measured",
126
+ "Dolce -- Sweet, melodic", "Bellona -- Strong, commanding",
127
+ "Vincent -- Rich, theatrical", "Andre -- Deep, resonant",
128
+ "Mia -- Young, versatile", "Aiden -- Young, lively",
129
+ ]
130
+
131
+ # YourVoic voices organized by language
132
+ YOURVOIC_VOICES = [
133
+ "Peter -- English, Professional male",
134
+ "Sarah -- English, Warm female",
135
+ "Caleb -- English, Expressive male",
136
+ "Natasha -- Hindi, Versatile female",
137
+ "Rahul -- Hindi, Friendly male",
138
+ "Deepika -- Hindi, Professional female",
139
+ "Aditya -- Hindi, Lively male",
140
+ "Priya -- Tamil, Caring female",
141
+ "Sneha -- Bengali, Wise female",
142
+ "Arjun -- Telugu, Strong male",
143
+ "Divya -- Kannada, Expressive female",
144
+ "Anjali -- Marathi, Professional female",
145
+ "Vikram -- Punjabi, Warm male",
146
+ "Kavya -- Odia, Energetic female",
147
+ "Nikhil -- Malayalam, Advanced male",
148
+ ]
149
+
150
+ YOURVOIC_MODELS = [
151
+ "aura-max -- Premium quality (best for audiobooks)",
152
+ "aura-prime -- Balanced quality and speed",
153
+ "aura-lite -- Fast, good for previews",
154
+ "rapid-max -- Fast with good quality",
155
+ "rapid-flash -- Fastest, real-time apps",
156
+ ]
157
+
158
+ YOURVOIC_EMOTIONS = [
159
+ "neutral", "friendly", "hopeful", "cheerful", "sad",
160
+ "excited", "angry", "terrified", "shouting", "whispering",
161
  ]
162
 
163
 
 
165
  return label.split("--")[0].strip()
166
 
167
 
168
def get_yourvoic_model(label):
    """Return the model id from a ``"model -- description"`` dropdown label."""
    model_id, _, _ = label.partition("--")
    return model_id.strip()
+
171
+
172
+ # ==========================================
173
+ # AUDIO HELPERS
174
+ # ==========================================
175
  def base64_to_wav(b64_data, output_path):
176
  audio_bytes = base64.b64decode(b64_data)
177
+ sr, nc, bps = 24000, 1, 16
 
 
178
  br = sr * nc * bps // 8
179
  ba = nc * bps // 8
180
  ds = len(audio_bytes)
 
215
 
216
def generate_silence(duration_sec, output_path):
    """Write *duration_sec* seconds of 24 kHz mono 16-bit silence to *output_path* via ffmpeg."""
    cmd = [
        "ffmpeg", "-y",
        "-f", "lavfi",
        "-i", "anullsrc=r=24000:cl=mono",
        "-t", str(duration_sec),
        "-acodec", "pcm_s16le",
        output_path,
    ]
    subprocess.run(cmd, capture_output=True, check=True)
  )
222
 
223
 
224
+ # ==========================================
225
+ # DOCUMENT EXTRACTION
226
+ # ==========================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
def extract_text_from_file(filepath):
    """Extract plain text from a .pdf, .docx, or plain-text file.

    Args:
        filepath: Path to the uploaded document.

    Returns:
        The extracted text; PDF pages / DOCX paragraphs are joined with blank lines.

    Raises:
        gr.Error: For legacy .doc files or when an optional parser is missing.
    """
    ext = os.path.splitext(filepath)[1].lower()
    if ext == ".pdf":
        if not HAS_PYPDF:
            raise gr.Error("pypdf not installed.")
        reader = pypdf.PdfReader(filepath)
        # BUG FIX: extract_text() was previously called twice per page (once
        # in the filter, once in the join) -- slow, and the join crashed with
        # AttributeError when a page returned None. Extract once, guard None.
        page_texts = ((page.extract_text() or "").strip() for page in reader.pages)
        return "\n\n".join(t for t in page_texts if t)
    elif ext in (".docx", ".doc"):
        if ext == ".doc":
            raise gr.Error("Please save as .docx or .pdf.")
        if not HAS_DOCX:
            raise gr.Error("python-docx not installed.")
        doc = docx.Document(filepath)
        return "\n\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())
    else:
        # .txt, .md, and anything else: read as UTF-8 text, never crash on
        # stray bytes (errors="replace").
        with open(filepath, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
244
 
245
 
246
+ # ==========================================
247
+ # TEXT SPLITTING
248
+ # ==========================================
249
def split_text_into_chunks(text, max_chars=None):
    """Split *text* into TTS-sized chunks of at most *max_chars* characters.

    Splits on paragraph boundaries first, falls back to sentence boundaries
    for oversized paragraphs, and finally hard-splits any single sentence
    that still exceeds the limit (previously such a sentence was emitted as
    an oversized chunk, which can break the TTS API character budget).

    Args:
        text: Source text; paragraphs separated by blank lines.
        max_chars: Chunk size limit; defaults to MAX_CHARS_PER_CHUNK
            (resolved lazily so the function is testable in isolation).

    Returns:
        List of non-empty chunk strings; empty list for blank input.
    """
    if max_chars is None:
        max_chars = MAX_CHARS_PER_CHUNK
    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    chunks = []
    current = ""

    def flush():
        # Emit the accumulated chunk (if any) and reset the accumulator.
        nonlocal current
        if current:
            chunks.append(current)
        current = ""

    for para in re.split(r"\n\s*\n", text):
        para = para.strip()
        if not para:
            continue
        if len(current) + len(para) + 2 <= max_chars:
            current = (current + "\n\n" + para).strip()
            continue
        flush()
        if len(para) <= max_chars:
            current = para
            continue
        # Paragraph alone exceeds the limit: split on sentence boundaries.
        for sentence in re.split(r"(?<=[.!?])\s+", para):
            if len(current) + len(sentence) + 1 <= max_chars:
                current = (current + " " + sentence).strip()
                continue
            flush()
            # BUG FIX: hard-split a single sentence longer than max_chars so
            # no chunk ever exceeds the limit.
            while len(sentence) > max_chars:
                chunks.append(sentence[:max_chars])
                sentence = sentence[max_chars:]
            current = sentence
    flush()
    return chunks
280
 
281
 
282
+ # ==========================================
283
+ # VOICE CLONING (Qwen)
284
+ # ==========================================
285
def prepare_clone_audio(audio_path):
    """Normalize a voice sample for enrollment: 24 kHz mono 16-bit WAV, max 60 s.

    Samples longer than 60 s are trimmed to a 60-second window, skipping up
    to the first 5 s (often silence or noise at the start of a recording).

    Args:
        audio_path: Path to the user-uploaded sample (WAV/MP3/M4A).

    Returns:
        Path to the prepared temporary WAV file.

    Raises:
        ValueError: If the duration cannot be read or is under 10 seconds.
    """
    result = subprocess.run(
        ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", audio_path],
        capture_output=True, text=True,
    )
    raw_duration = result.stdout.strip()
    # BUG FIX: when ffprobe fails (corrupt/unsupported file) stdout is empty
    # and float() raised an opaque "could not convert string to float" error.
    try:
        duration = float(raw_duration)
    except ValueError:
        raise ValueError(
            "Could not read audio duration - is the sample a valid WAV/MP3/M4A file?"
        ) from None
    if duration < 10:
        raise ValueError(f"Audio too short ({duration:.1f}s). Need at least 10 seconds.")
    tmp = audio_path + "_prepared.wav"
    if duration <= 60:
        subprocess.run(["ffmpeg", "-y", "-i", audio_path, "-ar", "24000", "-ac", "1",
                        "-acodec", "pcm_s16le", tmp], capture_output=True, check=True)
    else:
        # Skip up to 5 s of lead-in, then keep the next 60 s.
        start = min(5, duration - 60)
        subprocess.run(["ffmpeg", "-y", "-ss", str(start), "-t", "60", "-i", audio_path,
                        "-ar", "24000", "-ac", "1", "-acodec", "pcm_s16le", tmp],
                       capture_output=True, check=True)
    return tmp
 
 
 
 
 
304
 
305
 
306
def clone_voice(audio_path, api_key):
    """Enroll a custom voice from an audio sample via the DashScope API.

    The sample is normalized with :func:`prepare_clone_audio`, base64-encoded
    into a data URI, and posted to the voice-enrollment endpoint.

    Args:
        audio_path: Path to the user's voice sample.
        api_key: DashScope API key.

    Returns:
        The enrolled voice id to pass to the TTS-VC model.

    Raises:
        RuntimeError: On a non-200 response or a malformed response body.
        ValueError: Propagated from prepare_clone_audio for bad samples.
    """
    prepared = prepare_clone_audio(audio_path)
    b64 = base64.b64encode(pathlib.Path(prepared).read_bytes()).decode()
    # Best-effort cleanup of the temporary prepared file.
    try:
        os.remove(prepared)
    except OSError:
        pass
    resp = http_requests.post(VOICE_CLONE_URL, json={
        "model": VOICE_CLONE_MODEL,
        "input": {
            "action": "create", "target_model": TTS_VC_MODEL,
            "preferred_name": "audiobook_voice",
            "audio": {"data": f"data:audio/wav;base64,{b64}"},
        },
    }, headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}, timeout=60)
    if resp.status_code != 200:
        raise RuntimeError(f"Voice clone failed: {resp.text[:300]}")
    # BUG FIX: resp.json()["output"]["voice"] raised a bare KeyError when the
    # API returned 200 with an error body; surface a readable error instead.
    data = resp.json()
    voice_id = data.get("output", {}).get("voice")
    if not voice_id:
        raise RuntimeError(f"Voice clone response missing voice id: {json.dumps(data)[:300]}")
    return voice_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
 
326
+ # ==========================================
327
+ # TRANSLATION (Qwen text-only)
328
+ # ==========================================
 
 
 
 
329
def translate_text(client, text, target_language, lang_config):
    """Translate English *text* into *target_language* with the text-only Qwen model.

    Returns the bare translation (the system prompt forbids any extra output).
    """
    system_msg = f"Translate English to {target_language}. Output ONLY the translation."
    user_msg = f"Translate:\n\n{text}"
    response = client.chat.completions.create(
        model=OMNI_MODEL,
        modalities=["text"],
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
    )
    return response.choices[0].message.content.strip()
338
 
339
 
340
+ # ==========================================
341
+ # TTS MODE 1: PRESET VOICE (Qwen Omni)
342
+ # ==========================================
343
def generate_speech_preset(client, text, voice, language, lang_config, translate, chunk_index, output_dir):
    """Narrate one text chunk with a Qwen Omni preset voice.

    Streams text + audio deltas from Qwen3.5-Omni-Plus; when *translate* is
    set the model translates the English text into *language* and narrates
    the translation in one pass.

    Args:
        client: OpenAI-compatible DashScope client.
        text: English source chunk.
        voice: Preset voice name (e.g. "Jennifer").
        language: Target language display name.
        lang_config: LANGUAGES entry for *language*.
        translate: Whether to translate before narrating.
        chunk_index: Position of this chunk (used in the output filename).
        output_dir: Directory for the chunk WAV.

    Returns:
        (wav_path, transcript) on success, or (None, error_message) on failure.
    """
    output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
    if translate and language != "English":
        # BUG FIX: the current LANGUAGES entries no longer carry a "native"
        # key, so lang_config["native"] raised KeyError *outside* the try
        # block below. Fall back to the English language name when absent.
        native = lang_config.get("native", language)
        sys_prompt = (f"Translate English to {language} ({native}) "
                      f"and narrate expressively. Respond ONLY with spoken {language} narration.")
        user_text = f"Translate into {language} and narrate:\n\n{text}"
    else:
        sys_prompt = "Narrate expressively as an audiobook. Respond ONLY with narration."
        user_text = f"Narrate:\n\n{text}"
    try:
        completion = client.chat.completions.create(
            model=OMNI_MODEL,
            messages=[{"role": "system", "content": sys_prompt},
                      {"role": "user", "content": user_text}],
            modalities=["text", "audio"],
            audio={"voice": voice, "format": "wav"},
            stream=True,
            stream_options={"include_usage": True},
        )
        audio_parts, text_parts = [], []
        for event in completion:
            if not event.choices:
                continue
            delta = event.choices[0].delta
            if hasattr(delta, "content") and delta.content:
                text_parts.append(delta.content)
            if hasattr(delta, "audio") and delta.audio:
                # The SDK may surface audio deltas as dicts or objects.
                if isinstance(delta.audio, dict) and "data" in delta.audio:
                    audio_parts.append(delta.audio["data"])
                elif hasattr(delta.audio, "data") and delta.audio.data:
                    audio_parts.append(delta.audio.data)
        transcript = "".join(text_parts)
        if audio_parts:
            base64_to_wav("".join(audio_parts), output_wav)
            return output_wav, transcript
        return None, "No audio received"
    except Exception as e:
        return None, str(e)
378
 
379
 
380
+ # ==========================================
381
+ # TTS MODE 2: CLONED VOICE (Qwen TTS-VC)
382
+ # ==========================================
383
def generate_speech_cloned(client, text, voice_id, language, lang_config, translate, api_key, chunk_index, output_dir):
    """Narrate one text chunk with a cloned voice via Qwen3-TTS-VC.

    Translation (if requested) is done first with the text-only Qwen model,
    then the translated text is synthesized with the enrolled voice id.

    Returns:
        (wav_path, final_text, error) -- *error* is None on success; on
        failure wav_path is None and *error* describes what went wrong.
    """
    output_wav = os.path.join(output_dir, f"vc_chunk_{chunk_index:04d}.wav")
    final_text = text
    if translate and language != "English":
        final_text = translate_text(client, text, language, lang_config)
    # Qwen3-TTS-VC expects a plain English language name in language_type.
    lang_map = {
        "English": "English", "Chinese (Mandarin)": "Chinese", "Japanese": "Japanese",
        "Korean": "Korean", "German": "German", "French": "French",
        "Russian": "Russian", "Portuguese": "Portuguese", "Spanish": "Spanish", "Italian": "Italian",
    }
    resp = http_requests.post(TTS_SYNTHESIS_URL, json={
        "model": TTS_VC_MODEL,
        "input": {"text": final_text, "voice": voice_id,
                  "language_type": lang_map.get(language, "English")},
    }, headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}, timeout=120)
    if resp.status_code != 200:
        return None, final_text, f"TTS failed ({resp.status_code})"
    audio_url = resp.json().get("output", {}).get("audio", {}).get("url")
    if not audio_url:
        return None, final_text, "No audio URL"
    audio_resp = http_requests.get(audio_url, timeout=120)
    # BUG FIX: the downloaded bytes were previously written without checking
    # the download status, silently producing a corrupt/empty chunk file.
    if audio_resp.status_code != 200:
        return None, final_text, f"Audio download failed ({audio_resp.status_code})"
    with open(output_wav, "wb") as f:
        f.write(audio_resp.content)
    return output_wav, final_text, None
406
+
407
+
408
+ # ==========================================
409
+ # TTS MODE 3: YOURVOIC (emotional voices, 93+ languages)
410
+ # ==========================================
411
def generate_speech_yourvoic(client, text, voice, yv_model, emotion, language, lang_config, translate,
                             api_key, chunk_index, output_dir):
    """Generate speech for one chunk via the YourVoic API.

    Translation (English -> *language*) is done best-effort with the Qwen
    client first; on translation failure the English text is narrated
    instead. The YourVoic response may be direct audio bytes or JSON with an
    audio URL; either way the result is converted to 24 kHz mono WAV so all
    chunks concatenate cleanly.

    Returns:
        (wav_path, transcript, error) -- *error* is None on success.
    """
    output_file = os.path.join(output_dir, f"yv_chunk_{chunk_index:04d}.mp3")

    # Translate if needed (best-effort: fall back to English on failure).
    final_text = text
    transcript = text
    if translate and language != "English":
        try:
            ds_key = os.environ.get("DASHSCOPE_API_KEY", "")
            if ds_key and client:
                final_text = translate_text(client, text, language, lang_config)
                transcript = final_text
        except Exception as e:
            print(f"[YourVoic] Translation failed, using English: {e}")

    yourvoic_lang = lang_config.get("yourvoic", "en-US")
    payload = {
        "text": final_text,
        "voice": voice,
        "language": yourvoic_lang,
        "model": yv_model,
        "speed": 0.9,  # Slightly slower for audiobook pacing
    }
    # Add emotion only when it actually changes the delivery.
    if emotion and emotion != "neutral":
        payload["emotion"] = emotion

    headers = {
        "X-API-Key": api_key,
        "Content-Type": "application/json",
    }

    try:
        resp = http_requests.post(YOURVOIC_TTS_URL, json=payload, headers=headers, timeout=120)
        print(f"[YourVoic] Chunk {chunk_index}: status={resp.status_code}, size={len(resp.content)} bytes")

        if resp.status_code != 200:
            error_msg = resp.text[:200]
            print(f"[YourVoic] Error: {error_msg}")
            return None, transcript, f"YourVoic API error ({resp.status_code}): {error_msg}"

        # Response is either JSON (with an audio URL) or direct audio bytes.
        content_type = resp.headers.get("Content-Type", "")
        if "application/json" in content_type:
            data = resp.json()
            audio_url = data.get("audio_url") or data.get("url")
            if not audio_url:
                return None, transcript, f"No audio URL in response: {json.dumps(data)[:200]}"
            audio_resp = http_requests.get(audio_url, timeout=120)
            # BUG FIX: check the download status before writing -- a failed
            # download previously produced a corrupt chunk file silently.
            if audio_resp.status_code != 200:
                return None, transcript, f"Audio download failed ({audio_resp.status_code})"
            with open(output_file, "wb") as f:
                f.write(audio_resp.content)
        else:
            # Direct audio bytes
            with open(output_file, "wb") as f:
                f.write(resp.content)

        # Convert MP3 to WAV for consistent concatenation
        output_wav = output_file.replace(".mp3", ".wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", output_file, "-ar", "24000", "-ac", "1",
             "-acodec", "pcm_s16le", output_wav],
            capture_output=True, check=True,
        )
        return output_wav, transcript, None

    except Exception as e:
        return None, transcript, str(e)
483
+
484
+
485
+ # ==========================================
486
  # MAIN PIPELINE
487
+ # ==========================================
488
  def generate_audiobook(text_input, file_input, target_language, voice_mode,
489
+ preset_voice_label, clone_audio, yourvoic_voice_label,
490
+ yourvoic_model_label, yourvoic_emotion,
491
+ add_pauses, progress=gr.Progress()):
492
  # Resolve text
493
  if file_input is not None:
494
+ progress(0.02, desc="Extracting text from document...")
495
+ text = extract_text_from_file(file_input)
 
 
 
 
 
496
  elif text_input and text_input.strip():
497
  text = text_input.strip()
498
  else:
499
  raise gr.Error("Please provide text or upload a file.")
 
500
  if len(text) < 10:
501
  raise gr.Error("Text is too short.")
502
 
503
+ ds_key = os.environ.get("DASHSCOPE_API_KEY", "")
504
+ yv_key = os.environ.get("YOURVOIC_API_KEY", "")
 
505
 
506
  lang_config = LANGUAGES[target_language]
507
+ lang_engine = lang_config["engine"]
508
  use_clone = voice_mode == "Clone a Voice"
509
+ use_yourvoic = voice_mode == "YourVoic (Emotional AI)"
510
  translate = target_language != "English"
511
+
512
+ # Auto-correct engine if language requires it
513
+ if lang_engine == "yourvoic" and not use_yourvoic:
514
+ # Language only supported by YourVoic, force switch
515
+ use_yourvoic = True
516
+ use_clone = False
517
+ elif lang_engine == "qwen" and use_yourvoic:
518
+ # User chose YourVoic but language is Qwen-only — allow it since YourVoic
519
+ # supports most languages, but Qwen languages also work on YourVoic
520
+ pass
521
+
522
+ # Validate keys
523
+ if use_yourvoic:
524
+ if not yv_key:
525
+ raise gr.Error("YOURVOIC_API_KEY not set. Add it in Settings > Secrets. Get one at yourvoic.com/api/user")
526
+ if translate and not ds_key:
527
+ raise gr.Error("DASHSCOPE_API_KEY needed for translation. Add it in Settings > Secrets.")
528
+ else:
529
+ if not ds_key:
530
+ raise gr.Error("DASHSCOPE_API_KEY not set. Add it in Settings > Secrets.")
531
+
532
+ client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL) if ds_key else None
533
  tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
534
 
535
  # Voice cloning setup
536
  cloned_voice_id = None
537
  if use_clone:
538
  if clone_audio is None:
539
+ raise gr.Error("Upload a voice sample for cloning.")
 
540
  if target_language not in VOICE_CLONE_LANGUAGES:
541
+ raise gr.Error(f"Voice cloning supports: {', '.join(sorted(VOICE_CLONE_LANGUAGES))}")
542
+ progress(0.03, desc="Cloning voice...")
543
+ cloned_voice_id = clone_voice(clone_audio, ds_key)
 
 
 
 
 
 
 
 
544
 
545
  try:
546
+ progress(0.08, desc="Splitting text...")
 
547
  chunks = split_text_into_chunks(text)
548
  total_chunks = len(chunks)
549
  total_chars = sum(len(c) for c in chunks)
550
 
551
+ audio_files, all_transcripts = [], []
 
 
552
  silence_path = os.path.join(tmp_dir, "silence.wav")
553
  if add_pauses:
554
  generate_silence(1.5, silence_path)
555
 
556
  for i, chunk in enumerate(chunks):
557
+ frac = 0.10 + 0.78 * (i / total_chunks)
558
  progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")
559
 
560
+ wav_path, transcript, error = None, None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
561
 
562
+ if use_yourvoic:
563
+ yv_voice = get_voice_name(yourvoic_voice_label)
564
+ yv_model = get_yourvoic_model(yourvoic_model_label)
565
+ wav_path, transcript, error = generate_speech_yourvoic(
566
+ client, chunk, yv_voice, yv_model, yourvoic_emotion,
567
+ target_language, lang_config, translate,
568
+ yv_key, i, tmp_dir,
569
+ )
570
+ elif use_clone:
571
+ wav_path, transcript, error = generate_speech_cloned(
572
+ client, chunk, cloned_voice_id, target_language,
573
+ lang_config, translate, ds_key, i, tmp_dir,
574
+ )
575
  else:
 
576
  voice = get_voice_name(preset_voice_label)
577
  wav_path, transcript = generate_speech_preset(
578
  client, chunk, voice, target_language,
579
  lang_config, translate, i, tmp_dir,
580
  )
581
+ error = None if wav_path else transcript
582
 
583
+ if wav_path:
584
+ audio_files.append(wav_path)
585
+ else:
586
+ all_transcripts.append(f"Chunk {i+1} failed: {error}")
587
+ fail_sil = os.path.join(tmp_dir, f"fail_{i:04d}.wav")
588
+ generate_silence(2.0, fail_sil)
589
+ audio_files.append(fail_sil)
590
 
591
+ if transcript and "failed" not in str(transcript).lower():
592
+ all_transcripts.append(transcript)
593
 
 
594
  if add_pauses and i < total_chunks - 1 and audio_files:
595
  audio_files.append(silence_path)
596
 
597
  if not audio_files:
598
  raise gr.Error("No audio was generated.")
599
 
 
600
  progress(0.90, desc="Assembling audiobook...")
601
  final_audio = os.path.join(tmp_dir, "audiobook.wav")
602
  concatenate_wavs(audio_files, final_audio)
603
 
 
604
  progress(0.95, desc="Converting to MP3...")
605
  final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
606
  subprocess.run(
607
+ ["ffmpeg", "-y", "-i", final_audio, "-codec:a", "libmp3lame",
608
+ "-b:a", "128k", "-ar", "24000", "-ac", "1", final_mp3],
 
609
  capture_output=True, check=True,
610
  )
611
 
612
  progress(1.0, desc="Done!")
613
 
614
  audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
615
+ if use_yourvoic:
616
+ voice_info = f"YourVoic: {yourvoic_voice_label} ({yourvoic_emotion})"
617
+ mode_info = f"YourVoic API ({yourvoic_model_label})"
618
+ elif use_clone:
619
+ voice_info = f"Cloned (ID: {cloned_voice_id[:20]}...)"
620
+ mode_info = "Qwen3-TTS-VC"
621
+ else:
622
+ voice_info = preset_voice_label
623
+ mode_info = "Qwen3.5-Omni-Plus"
624
+
625
  stats = (
626
  f"**Audiobook Generated!**\n\n"
627
  f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
628
  f"- **Language:** {target_language} ({lang_config['native']})\n"
629
  f"- **Voice:** {voice_info}\n"
630
+ f"- **Engine:** {mode_info}\n"
631
  f"- **File size:** {audio_size:.1f} MB\n"
632
  )
 
 
633
 
634
  transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""
 
635
  return final_mp3, stats, transcript_text
636
 
637
  except gr.Error:
 
640
  raise gr.Error(f"Pipeline error: {str(e)}")
641
 
642
 
643
+ # ==========================================
644
  # GRADIO UI
645
+ # ==========================================
646
  SAMPLE_TEXT = """Chapter 1: The Beginning
647
 
648
+ The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin.
 
 
649
 
650
+ "One day," she whispered to the seagulls, "I'll follow that sun to wherever it goes."
651
 
652
+ Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather - grey in storms, green in sunlight.
653
 
654
+ The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside.
655
 
656
  "Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.
657
 
 
659
 
660
  DESCRIPTION = """
661
  # Audiobook Generator
662
+ ### English Text to Multi-Language Audiobook
663
+ **Three Voice Engines**
664
 
665
+ | Engine | Languages | Best for |
666
+ |--------|-----------|----------|
667
+ | **Qwen Preset** (20 voices) | English, Chinese, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian, Arabic | General audiobooks |
668
+ | **Qwen Clone** (your voice) | Same 10 core (excl. Arabic) | Personalized narration |
669
+ | **YourVoic** (1000+ emotional voices) | African: Afrikaans, Amharic, Swahili. Indian: Hindi, Marathi, Bengali, Telugu, Tamil + 16 more. South Asian: Urdu, Nepali, Sinhala | Indian/African languages, emotional narration |
670
 
671
+ The app automatically selects the right engine based on your chosen language. Or pick manually!
672
  """
673
 
674
+ # Build language dropdown grouped by engine
675
  lang_choices = []
676
+ # Qwen languages first (starred)
677
+ for name in LANGUAGES:
678
+ if LANGUAGES[name]["engine"] == "qwen":
679
+ lang_choices.append(f"Q: {name}")
680
+ # African languages
681
+ for name in ["Afrikaans", "Amharic", "Swahili"]:
682
+ if name in LANGUAGES:
683
+ lang_choices.append(f"YV: {name}")
684
+ # Indian languages
685
+ for name in ["Hindi", "Marathi", "Bengali", "Telugu", "Tamil", "Gujarati", "Kannada",
686
+ "Malayalam", "Punjabi", "Odia", "Assamese", "Maithili", "Kashmiri",
687
+ "Sindhi", "Konkani", "Dogri", "Manipuri", "Bodo", "Sanskrit"]:
688
+ if name in LANGUAGES:
689
+ lang_choices.append(f"YV: {name}")
690
+ # South Asian
691
+ for name in ["Urdu", "Nepali", "Sinhala"]:
692
+ if name in LANGUAGES:
693
+ lang_choices.append(f"YV: {name}")
694
 
695
 
696
def clean_language_name(choice):
    """Strip the engine prefix ("Q: ", "YV: ", legacy "* ") from a dropdown label."""
    name = choice
    for prefix in ("Q: ", "YV: ", "* "):
        name = name.replace(prefix, "")
    return name.strip()
698
+
699
+
700
def auto_select_engine(language_name):
    """Auto-select the right voice engine based on language."""
    entry = LANGUAGES.get(language_name)
    return entry["engine"] if entry is not None else "qwen"
705
 
706
 
707
def on_voice_mode_change(mode):
    """Toggle component visibility for the selected voice engine.

    Returns gr.update objects for, in order:
    (preset_voice, clone_audio, clone_info, yv_voice, yv_model, yv_emotion).
    """
    if mode == "Clone a Voice":
        visibility = (False, True, True, False, False, False)
    elif mode == "YourVoic (Emotional AI)":
        visibility = (False, False, False, True, True, True)
    else:  # Preset Voice
        visibility = (True, False, False, False, False, False)
    return tuple(gr.update(visible=v) for v in visibility)
717
+
718
+
719
def on_language_change(lang_choice):
    """Auto-switch the voice-engine radio when the target language changes."""
    engine = auto_select_engine(clean_language_name(lang_choice))
    new_mode = "YourVoic (Emotional AI)" if engine == "yourvoic" else "Preset Voice"
    return gr.update(value=new_mode)
727
 
728
 
729
def generate_wrapper(text_input, file_input, language_choice, voice_mode,
                     preset_voice, clone_audio, yv_voice, yv_model, yv_emotion,
                     add_pauses, progress=gr.Progress()):
    """UI adapter: resolve the prefixed dropdown label, then run the pipeline."""
    resolved_language = clean_language_name(language_choice)
    return generate_audiobook(
        text_input, file_input, resolved_language, voice_mode,
        preset_voice, clone_audio, yv_voice, yv_model, yv_emotion,
        add_pauses, progress,
    )
738
 
739
 
740
+ with gr.Blocks(title="Audiobook Generator") as demo:
 
 
 
 
 
 
 
741
 
742
  gr.Markdown(DESCRIPTION)
743
 
744
  with gr.Row():
745
  with gr.Column(scale=1):
746
+ text_input = gr.Textbox(label="English Text", placeholder="Paste your English text here...",
747
+ lines=10, max_lines=25)
748
+ file_input = gr.File(label="Or Upload (.txt, .md, .pdf, .docx)",
749
+ file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"], type="filepath")
 
 
 
 
 
 
 
750
  sample_btn = gr.Button("Load Sample Text", variant="secondary", size="sm")
751
 
752
+ target_lang = gr.Dropdown(choices=lang_choices, value="Q: English", label="Target Language",
753
+ info="Q: = Qwen engine, YV: = YourVoic engine. Auto-switches voice engine.")
 
 
 
 
754
 
755
  voice_mode = gr.Radio(
756
+ choices=["Preset Voice", "Clone a Voice", "YourVoic (Emotional AI)"],
757
+ value="Preset Voice", label="Voice Engine",
 
 
 
 
 
 
 
 
758
  )
759
 
760
+ # Preset voice controls
761
+ preset_voice = gr.Dropdown(choices=PRESET_VOICES, value="Jennifer -- Cinematic narrator",
762
+ label="Qwen Preset Voice", visible=True)
 
 
763
 
764
+ # Clone voice controls
765
+ clone_audio = gr.Audio(label="Voice Sample (10s-3min)", type="filepath", visible=False)
766
  clone_info = gr.Markdown(
767
+ value=("> **Voice cloning tips:** 10-180s clear speech, no background noise. "
768
+ "Supports 10 core languages only."),
 
 
 
 
 
 
769
  visible=False,
770
  )
771
 
772
+ # YourVoic controls
773
+ yv_voice = gr.Dropdown(choices=YOURVOIC_VOICES, value="Peter -- English, Professional male",
774
+ label="YourVoic Voice", visible=False, allow_custom_value=True,
775
+ info="Type any voice name or pick from the list")
776
+ yv_model = gr.Dropdown(choices=YOURVOIC_MODELS, value="aura-max -- Premium quality (best for audiobooks)",
777
+ label="YourVoic Model", visible=False)
778
+ yv_emotion = gr.Dropdown(choices=YOURVOIC_EMOTIONS, value="friendly",
779
+ label="Emotion Style", visible=False,
780
+ info="Add emotional expression to the narration")
781
+
782
+ add_pauses = gr.Checkbox(value=True, label="Add pauses between sections", info="1.5s silence between chunks")
783
 
784
  generate_btn = gr.Button("Generate Audiobook", variant="primary", size="lg")
785
 
 
791
 
792
  sample_btn.click(fn=lambda: SAMPLE_TEXT, outputs=text_input)
793
 
794
+ # Auto-switch voice engine when language changes
795
+ target_lang.change(fn=on_language_change, inputs=target_lang, outputs=[voice_mode])
796
+
797
  voice_mode.change(
798
+ fn=on_voice_mode_change, inputs=voice_mode,
799
+ outputs=[preset_voice, clone_audio, clone_info, yv_voice, yv_model, yv_emotion],
 
800
  )
801
 
802
  generate_btn.click(
803
  fn=generate_wrapper,
804
  inputs=[text_input, file_input, target_lang, voice_mode,
805
+ preset_voice, clone_audio, yv_voice, yv_model, yv_emotion, add_pauses],
806
  outputs=[audio_output, stats_output, transcript_output],
807
  )
808
 
809
  gr.Markdown(
810
  "---\n"
811
+ "**Engines:**\n\n"
812
+ "**Qwen Preset:** 11 languages (EN, ZH, JA, KO, DE, FR, RU, PT, ES, IT, AR) via Qwen3.5-Omni-Plus\n\n"
813
+ "**Qwen Clone:** 10 languages (same minus Arabic) via Qwen3-TTS-VC\n\n"
814
+ "**YourVoic:** African (Afrikaans, Amharic, Swahili) + Indian (Hindi, Tamil, Telugu + 17 more) "
815
+ "+ South Asian (Urdu, Nepali, Sinhala) via YourVoic API with emotional voices\n\n"
816
+ "Built with Gradio | Qwen by Alibaba | YourVoic by YourVoic Private Limited"
817
  )
818
 
819
  if __name__ == "__main__":