Spaces:

Nymbo
/

Tools

Running

App Files

xet

Community

Nymbo committed on Aug 25

Commit

d6038df

verified ·

1 Parent(s): 574e025

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -16

app.py CHANGED Viewed

@@ -582,16 +582,19 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     languages and accents including American, British, European, Hindi, Italian,
     Japanese, Portuguese, and Chinese speakers.
     Default behavior:
-        - Speed defaults to 1.25 (slightly brisk cadence) for clearer, snappier delivery.
-        - Voice defaults to "af_heart" (American Female, Heart voice)
     Args:
         text: The text to synthesize. Works best with English but supports multiple languages.
         speed: Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
         voice: Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices.
-               Examples: 'af_heart' (US female), 'am_adam' (US male), 'bf_alice' (British female),
-               'jf_alpha' (Japanese female), 'zf_xiaobei' (Chinese female).
     Returns:
         A tuple of (sample_rate_hz, audio_waveform) where:
@@ -615,19 +618,49 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     if pipeline is None:
         raise gr.Error("Kokoro English pipeline not initialized.")
     pack = pipeline.load_voice(voice)
-    # Generate using the last reference state from the current phoneme sequence
-    for _, ps, _ in pipeline(text, voice, speed):
-        ref_s = pack[len(ps) - 1]
-        try:
-            audio = model(ps, ref_s, float(speed))
-        except Exception as e:  # propagate as UI-friendly error
-            raise gr.Error(f"Error generating audio: {str(e)}")
         # Return 24 kHz mono waveform
-        return 24_000, audio.detach().cpu().numpy()
-    # If pipeline produced no segments
-    raise gr.Error("No audio was generated (empty synthesis result).")
 # ======================
@@ -759,12 +792,13 @@ kokoro_interface = gr.Interface(
     outputs=gr.Audio(label="Audio", type="numpy"),
     title="Kokoro TTS",
     description=(
-        "<div style=\"text-align:center\">Generate speech with Kokoro-82M using 54 different voices. Supports multiple languages and accents. Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
         "Synthesize speech from text using Kokoro-82M with 54 voice options. Returns (sample_rate, waveform) suitable for playback. "
         "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options). "
         "Available voices include American/British/European/Hindi/Italian/Japanese/Portuguese/Chinese speakers. "
         "Return the generated media to the user in this format `![Alt text](URL)`"
     ),
     allow_flagging="never",

     languages and accents including American, British, European, Hindi, Italian,
     Japanese, Portuguese, and Chinese speakers.
+    Enhanced for longer audio generation:
+        - Can generate audio of any length based on input text
+        - Concatenates multiple segments for seamless longer audio
     Default behavior:
+        - Speed defaults to 1.25 (slightly brisk cadence).
+        - Voice defaults to "af_heart".
     Args:
         text: The text to synthesize. Works best with English but supports multiple languages.
         speed: Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
         voice: Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices.
+               Examples: 'af_heart' (US female), 'am_adam' (US male), 'bf_bella' (British female).
     Returns:
         A tuple of (sample_rate_hz, audio_waveform) where:
     if pipeline is None:
         raise gr.Error("Kokoro English pipeline not initialized.")
+    # Process ALL segments for longer audio generation
+    audio_segments = []
     pack = pipeline.load_voice(voice)
+    try:
+        # Get all segments first to show progress for long text
+        segments = list(pipeline(text, voice, speed))
+        total_segments = len(segments)
+        # Iterate through ALL segments instead of just the first one
+        for segment_idx, (text_chunk, ps, _) in enumerate(segments):
+            ref_s = pack[len(ps) - 1]
+            try:
+                audio = model(ps, ref_s, float(speed))
+                audio_segments.append(audio.detach().cpu().numpy())
+                # For very long text (>10 segments), show progress every few segments
+                if total_segments > 10 and (segment_idx + 1) % 5 == 0:
+                    print(f"Progress: Generated {segment_idx + 1}/{total_segments} segments...")
+            except Exception as e:
+                raise gr.Error(f"Error generating audio for segment {segment_idx + 1}: {str(e)}")
+        if not audio_segments:
+            raise gr.Error("No audio was generated (empty synthesis result).")
+        # Concatenate all segments to create the complete audio
+        if len(audio_segments) == 1:
+            final_audio = audio_segments[0]
+        else:
+            final_audio = np.concatenate(audio_segments, axis=0)
+            # For multi-segment audio, provide completion info
+            duration = len(final_audio) / 24_000
+            if total_segments > 1:
+                print(f"Completed: {total_segments} segments concatenated into {duration:.1f} seconds of audio")
         # Return 24 kHz mono waveform
+        return 24_000, final_audio
+    except gr.Error:
+        raise  # Re-raise Gradio errors as-is
+    except Exception as e:
+        raise gr.Error(f"Error during speech generation: {str(e)}")
 # ======================
     outputs=gr.Audio(label="Audio", type="numpy"),
     title="Kokoro TTS",
     description=(
+        "<div style=\"text-align:center\">Generate speech with Kokoro-82M using 54 different voices. Supports multiple languages and accents. Can generate audio of any length! Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
         "Synthesize speech from text using Kokoro-82M with 54 voice options. Returns (sample_rate, waveform) suitable for playback. "
         "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options). "
         "Available voices include American/British/European/Hindi/Italian/Japanese/Portuguese/Chinese speakers. "
+        "Can generate audio of unlimited length by processing all text segments. "
         "Return the generated media to the user in this format `![Alt text](URL)`"
     ),
     allow_flagging="never",