Nymbo committed on
Commit
d6038df
·
verified ·
1 Parent(s): 574e025

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -16
app.py CHANGED
@@ -582,16 +582,19 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
582
  languages and accents including American, British, European, Hindi, Italian,
583
  Japanese, Portuguese, and Chinese speakers.
584
 
 
 
 
 
585
  Default behavior:
586
- - Speed defaults to 1.25 (slightly brisk cadence) for clearer, snappier delivery.
587
- - Voice defaults to "af_heart" (American Female, Heart voice)
588
 
589
  Args:
590
  text: The text to synthesize. Works best with English but supports multiple languages.
591
  speed: Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
592
  voice: Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices.
593
- Examples: 'af_heart' (US female), 'am_adam' (US male), 'bf_alice' (British female),
594
- 'jf_alpha' (Japanese female), 'zf_xiaobei' (Chinese female).
595
 
596
  Returns:
597
  A tuple of (sample_rate_hz, audio_waveform) where:
@@ -615,19 +618,49 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
615
  if pipeline is None:
616
  raise gr.Error("Kokoro English pipeline not initialized.")
617
 
 
 
618
  pack = pipeline.load_voice(voice)
619
- # Generate using the last reference state from the current phoneme sequence
620
- for _, ps, _ in pipeline(text, voice, speed):
621
- ref_s = pack[len(ps) - 1]
622
- try:
623
- audio = model(ps, ref_s, float(speed))
624
- except Exception as e: # propagate as UI-friendly error
625
- raise gr.Error(f"Error generating audio: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
626
  # Return 24 kHz mono waveform
627
- return 24_000, audio.detach().cpu().numpy()
628
-
629
- # If pipeline produced no segments
630
- raise gr.Error("No audio was generated (empty synthesis result).")
 
 
631
 
632
 
633
  # ======================
@@ -759,12 +792,13 @@ kokoro_interface = gr.Interface(
759
  outputs=gr.Audio(label="Audio", type="numpy"),
760
  title="Kokoro TTS",
761
  description=(
762
- "<div style=\"text-align:center\">Generate speech with Kokoro-82M using 54 different voices. Supports multiple languages and accents. Runs on CPU or CUDA if available.</div>"
763
  ),
764
  api_description=(
765
  "Synthesize speech from text using Kokoro-82M with 54 voice options. Returns (sample_rate, waveform) suitable for playback. "
766
  "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options). "
767
  "Available voices include American/British/European/Hindi/Italian/Japanese/Portuguese/Chinese speakers. "
 
768
  "Return the generated media to the user in this format `![Alt text](URL)`"
769
  ),
770
  allow_flagging="never",
 
582
  languages and accents including American, British, European, Hindi, Italian,
583
  Japanese, Portuguese, and Chinese speakers.
584
 
585
+ Enhanced for longer audio generation:
586
+ - Can generate audio of any length based on input text
587
+ - Concatenates multiple segments for seamless longer audio
588
+
589
  Default behavior:
590
+ - Speed defaults to 1.25 (slightly brisk cadence).
591
+ - Voice defaults to "af_heart".
592
 
593
  Args:
594
  text: The text to synthesize. Works best with English but supports multiple languages.
595
  speed: Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
596
  voice: Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices.
597
+ Examples: 'af_heart' (US female), 'am_adam' (US male), 'bf_bella' (British female).
 
598
 
599
  Returns:
600
  A tuple of (sample_rate_hz, audio_waveform) where:
 
618
  if pipeline is None:
619
  raise gr.Error("Kokoro English pipeline not initialized.")
620
 
621
+ # Process ALL segments for longer audio generation
622
+ audio_segments = []
623
  pack = pipeline.load_voice(voice)
624
+
625
+ try:
626
+ # Get all segments first to show progress for long text
627
+ segments = list(pipeline(text, voice, speed))
628
+ total_segments = len(segments)
629
+
630
+ # Iterate through ALL segments instead of just the first one
631
+ for segment_idx, (text_chunk, ps, _) in enumerate(segments):
632
+ ref_s = pack[len(ps) - 1]
633
+ try:
634
+ audio = model(ps, ref_s, float(speed))
635
+ audio_segments.append(audio.detach().cpu().numpy())
636
+
637
+ # For very long text (>10 segments), show progress every few segments
638
+ if total_segments > 10 and (segment_idx + 1) % 5 == 0:
639
+ print(f"Progress: Generated {segment_idx + 1}/{total_segments} segments...")
640
+
641
+ except Exception as e:
642
+ raise gr.Error(f"Error generating audio for segment {segment_idx + 1}: {str(e)}")
643
+
644
+ if not audio_segments:
645
+ raise gr.Error("No audio was generated (empty synthesis result).")
646
+
647
+ # Concatenate all segments to create the complete audio
648
+ if len(audio_segments) == 1:
649
+ final_audio = audio_segments[0]
650
+ else:
651
+ final_audio = np.concatenate(audio_segments, axis=0)
652
+ # For multi-segment audio, provide completion info
653
+ duration = len(final_audio) / 24_000
654
+ if total_segments > 1:
655
+ print(f"Completed: {total_segments} segments concatenated into {duration:.1f} seconds of audio")
656
+
657
  # Return 24 kHz mono waveform
658
+ return 24_000, final_audio
659
+
660
+ except gr.Error:
661
+ raise # Re-raise Gradio errors as-is
662
+ except Exception as e:
663
+ raise gr.Error(f"Error during speech generation: {str(e)}")
664
 
665
 
666
  # ======================
 
792
  outputs=gr.Audio(label="Audio", type="numpy"),
793
  title="Kokoro TTS",
794
  description=(
795
+ "<div style=\"text-align:center\">Generate speech with Kokoro-82M using 54 different voices. Supports multiple languages and accents. Can generate audio of any length! Runs on CPU or CUDA if available.</div>"
796
  ),
797
  api_description=(
798
  "Synthesize speech from text using Kokoro-82M with 54 voice options. Returns (sample_rate, waveform) suitable for playback. "
799
  "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options). "
800
  "Available voices include American/British/European/Hindi/Italian/Japanese/Portuguese/Chinese speakers. "
801
+ "Can generate audio of unlimited length by processing all text segments. "
802
  "Return the generated media to the user in this format `![Alt text](URL)`"
803
  ),
804
  allow_flagging="never",