Nymbo committed on
Commit
574e025
·
verified ·
1 Parent(s): 279d766

adding 53 more voices to Kokoro-TTS

Browse files
Files changed (1) hide show
  1. app.py +100 -17
app.py CHANGED
@@ -1,8 +1,8 @@
1
- # Purpose: One Space that offers four tools/tabs (all exposed as MCP tools):
2
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
3
  # 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
4
  # 3) Python Code Executor — run Python code and capture stdout/errors
5
- # 4) Kokoro TTS — synthesize speech from text using Kokoro-82M
6
  # 5) Image Generation - HF serverless inference providers (Default: FLUX.1-Krea-dev)
7
  # 6) Video Generation - HF serverless inference providers (Default: Wan2.2-T2V-A14B)
8
 
@@ -457,6 +457,50 @@ _KOKORO_STATE = {
457
  }
458
 
459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  def _init_kokoro() -> None:
461
  """Lazy-initialize Kokoro model and pipelines on first use.
462
 
@@ -497,26 +541,57 @@ def _init_kokoro() -> None:
497
  )
498
 
499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
501
  text: Annotated[str, "The text to synthesize (English)."],
502
  speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.25,
503
- voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
504
  ) -> Tuple[int, np.ndarray]:
505
  """
506
- Synthesize speech from text using the Kokoro-82M model.
507
 
508
  This function returns raw audio suitable for a Gradio Audio component and is
509
- also exposed as an MCP tool (per the latest Hugging Face/Gradio MCP docs, a
510
- tool is created for each function wired into your app; docstrings and type
511
- hints are used to describe the tool).
512
 
513
  Default behavior:
514
  - Speed defaults to 1.25 (slightly brisk cadence) for clearer, snappier delivery.
 
515
 
516
  Args:
517
- text: The text to synthesize (English).
518
  speed: Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
519
- voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
 
 
520
 
521
  Returns:
522
  A tuple of (sample_rate_hz, audio_waveform) where:
@@ -524,9 +599,10 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
524
  - audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
525
 
526
  Notes:
527
- - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
528
- raised with installation guidance.
529
  - Runs on CUDA if available; otherwise CPU.
 
 
530
  """
531
  if not text or not text.strip():
532
  raise gr.Error("Please provide non-empty text to synthesize.")
@@ -640,7 +716,7 @@ CSS_STYLES = """
640
  /* Place bold tools list on line 2, normal auth note on line 3 (below title) */
641
  .gradio-container h1::before {
642
  grid-row: 2;
643
- content: "Fetch Webpage | Search DuckDuckGo | Code Interpreter | Kokoro TTS | Image Generation | Video Generation";
644
  display: block;
645
  font-size: 1rem;
646
  font-weight: 700;
@@ -667,21 +743,28 @@ CSS_STYLES = """
667
  """
668
 
669
  # --- Kokoro TTS tab (text to speech) ---
 
670
  kokoro_interface = gr.Interface(
671
  fn=Generate_Speech,
672
  inputs=[
673
  gr.Textbox(label="Text", placeholder="Type text to synthesize…", lines=4),
674
- gr.Slider(minimum=0.5, maximum=2.0, value=1.25, step=0.1, label="Speed"),
675
- gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
 
 
 
 
 
676
  ],
677
  outputs=gr.Audio(label="Audio", type="numpy"),
678
  title="Kokoro TTS",
679
  description=(
680
- "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
681
  ),
682
  api_description=(
683
- "Synthesize speech from text using Kokoro-82M. Returns (sample_rate, waveform) suitable for playback. "
684
- "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str). "
 
685
  "Return the generated media to the user in this format `![Alt text](URL)`"
686
  ),
687
  allow_flagging="never",
 
1
+ # Purpose: One Space that offers six tools/tabs (all exposed as MCP tools):
2
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
3
  # 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
4
  # 3) Python Code Executor — run Python code and capture stdout/errors
5
+ # 4) Kokoro TTS — synthesize speech from text using Kokoro-82M with 54 voice options
6
  # 5) Image Generation - HF serverless inference providers (Default: FLUX.1-Krea-dev)
7
  # 6) Video Generation - HF serverless inference providers (Default: Wan2.2-T2V-A14B)
8
 
 
457
  }
458
 
459
 
460
+ def get_kokoro_voices():
461
+ """Get comprehensive list of available Kokoro voice IDs (54 total)."""
462
+ try:
463
+ from huggingface_hub import list_repo_files
464
+ # Get voice files from the Kokoro repository
465
+ files = list_repo_files('hexgrad/Kokoro-82M')
466
+ voice_files = [f for f in files if f.endswith('.pt') and f.startswith('voices/')]
467
+ voices = [f.replace('voices/', '').replace('.pt', '') for f in voice_files]
468
+ return sorted(voices) if voices else _get_fallback_voices()
469
+ except Exception:
470
+ return _get_fallback_voices()
471
+
472
+
473
+ def _get_fallback_voices():
474
+ """Return comprehensive fallback list of known Kokoro voices (54 total)."""
475
+ return [
476
+ # American Female (11 voices)
477
+ "af_alloy", "af_aoede", "af_bella", "af_heart", "af_jessica",
478
+ "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky",
479
+ # American Male (9 voices)
480
+ "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam",
481
+ "am_michael", "am_onyx", "am_puck", "am_santa",
482
+ # British Female (4 voices)
483
+ "bf_alice", "bf_emma", "bf_isabella", "bf_lily",
484
+ # British Male (4 voices)
485
+ "bm_daniel", "bm_fable", "bm_george", "bm_lewis",
486
+ # European Female/Male (3 voices)
487
+ "ef_dora", "em_alex", "em_santa",
488
+ # French Female (1 voice)
489
+ "ff_siwis",
490
+ # Hindi Female/Male (4 voices)
491
+ "hf_alpha", "hf_beta", "hm_omega", "hm_psi",
492
+ # Italian Female/Male (2 voices)
493
+ "if_sara", "im_nicola",
494
+ # Japanese Female/Male (5 voices)
495
+ "jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro", "jm_kumo",
496
+ # Portuguese Female/Male (3 voices)
497
+ "pf_dora", "pm_alex", "pm_santa",
498
+ # Chinese Female/Male (8 voices)
499
+ "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi",
500
+ "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang"
501
+ ]
502
+
503
+
504
  def _init_kokoro() -> None:
505
  """Lazy-initialize Kokoro model and pipelines on first use.
506
 
 
541
  )
542
 
543
 
544
def List_Kokoro_Voices() -> List[str]:
    """
    Get a list of the available Kokoro voice identifiers.

    This MCP tool helps clients discover the voice options for the
    Generate_Speech tool. It returns whatever get_kokoro_voices() returns:
    the voices listed in the Kokoro-82M repository when that listing
    succeeds, otherwise the built-in fallback list of 54 known voices.

    Returns:
        List[str]: A list of voice identifiers (e.g., ["af_heart", "am_adam", "bf_alice", ...])

    Voice naming convention (<language><gender>_<name>):
    - First letter: language/accent (a=American English, b=British English,
      e=Spanish, f=French, h=Hindi, i=Italian, j=Japanese,
      p=Brazilian Portuguese, z=Mandarin Chinese)
    - Second letter: gender (f=female, m=male)
    - After the underscore: voice name (heart, adam, alice, etc.)

    Categories in the built-in fallback list:
    - American English Female/Male (20 voices)
    - British English Female/Male (8 voices)
    - Spanish Female/Male (3 voices)
    - French Female (1 voice)
    - Hindi Female/Male (4 voices)
    - Italian Female/Male (2 voices)
    - Japanese Female/Male (5 voices)
    - Brazilian Portuguese Female/Male (3 voices)
    - Mandarin Chinese Female/Male (8 voices)
    """
    return get_kokoro_voices()
570
+
571
+
572
  def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
573
  text: Annotated[str, "The text to synthesize (English)."],
574
  speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.25,
575
+ voice: Annotated[str, "Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices. Examples: 'af_heart' (US female), 'am_adam' (US male), 'bf_alice' (British female), 'jf_alpha' (Japanese female)."] = "af_heart",
576
  ) -> Tuple[int, np.ndarray]:
577
  """
578
+ Synthesize speech from text using the Kokoro-82M model with 54 voice options.
579
 
580
  This function returns raw audio suitable for a Gradio Audio component and is
581
+ also exposed as an MCP tool. It supports 54 different voices across multiple
582
+ languages and accents including American, British, European, Hindi, Italian,
583
+ Japanese, Portuguese, and Chinese speakers.
584
 
585
  Default behavior:
586
  - Speed defaults to 1.25 (slightly brisk cadence) for clearer, snappier delivery.
587
+ - Voice defaults to "af_heart" (American Female, Heart voice)
588
 
589
  Args:
590
+ text: The text to synthesize. Works best with English but supports multiple languages.
591
  speed: Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
592
+ voice: Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices.
593
+ Examples: 'af_heart' (US female), 'am_adam' (US male), 'bf_alice' (British female),
594
+ 'jf_alpha' (Japanese female), 'zf_xiaobei' (Chinese female).
595
 
596
  Returns:
597
  A tuple of (sample_rate_hz, audio_waveform) where:
 
599
  - audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
600
 
601
  Notes:
602
+ - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is raised.
 
603
  - Runs on CUDA if available; otherwise CPU.
604
+ - Supports 54 voices across 9 language/accent categories.
605
+ - Use List_Kokoro_Voices() MCP tool to discover all available voice options.
606
  """
607
  if not text or not text.strip():
608
  raise gr.Error("Please provide non-empty text to synthesize.")
 
716
  /* Place bold tools list on line 2, normal auth note on line 3 (below title) */
717
  .gradio-container h1::before {
718
  grid-row: 2;
719
+ content: "Fetch Webpage | Search DuckDuckGo | Code Interpreter | Kokoro TTS (54 voices) | Image Generation | Video Generation";
720
  display: block;
721
  font-size: 1rem;
722
  font-weight: 700;
 
743
  """
744
 
745
  # --- Kokoro TTS tab (text to speech) ---
746
+ available_voices = get_kokoro_voices()
747
  kokoro_interface = gr.Interface(
748
  fn=Generate_Speech,
749
  inputs=[
750
  gr.Textbox(label="Text", placeholder="Type text to synthesize…", lines=4),
751
+ gr.Slider(minimum=0.5, maximum=2.0, value=1.25, step=0.1, label="Speed"),
752
+ gr.Dropdown(
753
+ label="Voice",
754
+ choices=available_voices,
755
+ value="af_heart",
756
+ info="Select from 54 available voices across multiple languages and accents"
757
+ ),
758
  ],
759
  outputs=gr.Audio(label="Audio", type="numpy"),
760
  title="Kokoro TTS",
761
  description=(
762
+ "<div style=\"text-align:center\">Generate speech with Kokoro-82M using 54 different voices. Supports multiple languages and accents. Runs on CPU or CUDA if available.</div>"
763
  ),
764
  api_description=(
765
+ "Synthesize speech from text using Kokoro-82M with 54 voice options. Returns (sample_rate, waveform) suitable for playback. "
766
+ "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options). "
767
+ "Available voices include American/British/European/Hindi/Italian/Japanese/Portuguese/Chinese speakers. "
768
  "Return the generated media to the user in this format `![Alt text](URL)`"
769
  ),
770
  allow_flagging="never",