alyxsis committed on
Commit
d0cf561
·
verified ·
1 Parent(s): 3f44b2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -45
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  import os
3
  import tempfile
4
  import time
@@ -154,13 +155,33 @@ def get_kokoro_voices():
154
  "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi", "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang"
155
  ]
156
 
157
- def kokoro_tts(text: str, speed: float, voice: str) -> str:
158
- """Generate speech with Kokoro-82M, supporting long text via segment stitching.
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
- Replicates the long-audio behavior from Tools-MCP by generating audio for
161
- every segment produced by the Kokoro pipeline and concatenating results.
162
- Returns a path to a temporary WAV file (24 kHz mono).
163
- """
 
 
 
 
 
 
 
164
  if not text or not text.strip():
165
  raise gr.Error("Please enter text to synthesize.")
166
 
@@ -171,72 +192,80 @@ def kokoro_tts(text: str, speed: float, voice: str) -> str:
171
  if pipeline is None:
172
  raise gr.Error("Kokoro English pipeline not initialized.")
173
 
174
- sr = 24_000
175
-
176
- # Process ALL segments for longer audio generation
177
- audio_segments = []
178
  pack = pipeline.load_voice(voice)
179
 
180
  try:
181
- segments = list(pipeline(text, voice, speed))
182
- total_segments = len(segments)
183
-
184
- for idx, (_, ps, _) in enumerate(segments):
185
  ref_s = pack[len(ps) - 1]
186
  try:
187
  audio = model(ps, ref_s, float(speed))
188
- audio_segments.append(audio.detach().cpu().numpy())
189
- if total_segments > 10 and (idx + 1) % 5 == 0:
190
- print(f"Progress: Generated {idx + 1}/{total_segments} segments...")
191
  except Exception as e:
192
  raise gr.Error(f"Error generating audio for segment {idx + 1}: {str(e)[:200]}...")
193
-
194
- if not audio_segments:
195
- raise gr.Error("No audio was generated.")
196
-
197
- # Concatenate all segments to create the complete audio
198
- if len(audio_segments) == 1:
199
- audio_np = audio_segments[0]
200
- else:
201
- audio_np = np.concatenate(audio_segments, axis=0)
202
- duration = len(audio_np) / sr
203
- print(f"Completed: {total_segments} segments concatenated into {duration:.1f} seconds of audio")
204
-
205
  except gr.Error:
206
  raise
207
  except Exception as e:
208
  raise gr.Error(f"Error during speech generation: {str(e)[:200]}...")
209
 
210
- # Convert to 16-bit PCM and write to WAV file
211
- audio_clipped = np.clip(audio_np, -1.0, 1.0)
212
- audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
213
 
214
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
215
- path = tmp.name
216
- with wave.open(path, "wb") as wf:
217
- wf.setnchannels(1)
218
- wf.setsampwidth(2)
219
- wf.setframerate(sr)
220
- wf.writeframes(audio_int16.tobytes())
 
 
221
 
222
- return path
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  # Main dispatcher function to handle all services
 
 
 
 
 
 
225
  def generate_tts(text, service, openai_api_key, openai_model, openai_voice,
226
  elevenlabs_api_key, elevenlabs_voice, voice_dict,
227
  kokoro_speed, kokoro_voice):
228
  """Route to appropriate TTS service based on selection"""
 
 
 
 
229
  if service == "OpenAI":
230
- return openai_tts(text, openai_model, openai_voice, openai_api_key)
231
  elif service == "ElevenLabs":
232
  voice_id = voice_dict.get(elevenlabs_voice, elevenlabs_voice)
233
- return elevenlabs_tts(text, voice_id, elevenlabs_api_key)
234
- elif service == "Kokoro":
235
- return kokoro_tts(text, kokoro_speed, kokoro_voice)
236
  else:
237
- # Fallback in case of an unknown service
238
  raise gr.Error(f"Unknown service selected: {service}")
239
 
 
 
 
 
 
 
 
 
 
 
240
  # Function to update ElevenLabs voices when API key changes
241
  def update_elevenlabs_voices(api_key):
242
  """Update voice dropdown when API key is entered"""
@@ -341,6 +370,9 @@ with gr.Blocks(theme='Nymbo/Alyx_Theme') as demo:
341
 
342
  audio_output = gr.Audio(
343
  label="Generated Speech",
 
 
 
344
  )
345
 
346
  # ==========================
 
1
  import gradio as gr
2
+ import io
3
  import os
4
  import tempfile
5
  import time
 
155
  "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi", "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang"
156
  ]
157
 
158
+ def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
159
+ audio_clipped = np.clip(audio_np, -1.0, 1.0)
160
+ return (audio_clipped * 32767.0).astype(np.int16)
161
+
162
+
163
+ def _write_wav_file(audio_int16: np.ndarray, sample_rate: int = 24_000) -> str:
164
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
165
+ path = tmp.name
166
+ with wave.open(path, "wb") as wf:
167
+ wf.setnchannels(1)
168
+ wf.setsampwidth(2)
169
+ wf.setframerate(sample_rate)
170
+ wf.writeframes(audio_int16.tobytes())
171
+ return path
172
+
173
 
174
+ def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int = 24_000) -> bytes:
175
+ buffer = io.BytesIO()
176
+ with wave.open(buffer, "wb") as wf:
177
+ wf.setnchannels(1)
178
+ wf.setsampwidth(2)
179
+ wf.setframerate(sample_rate)
180
+ wf.writeframes(audio_int16.tobytes())
181
+ return buffer.getvalue()
182
+
183
+
184
+ def _kokoro_segment_generator(text: str, speed: float, voice: str):
185
  if not text or not text.strip():
186
  raise gr.Error("Please enter text to synthesize.")
187
 
 
192
  if pipeline is None:
193
  raise gr.Error("Kokoro English pipeline not initialized.")
194
 
 
 
 
 
195
  pack = pipeline.load_voice(voice)
196
 
197
  try:
198
+ for idx, (_, ps, _) in enumerate(pipeline(text, voice, speed)):
 
 
 
199
  ref_s = pack[len(ps) - 1]
200
  try:
201
  audio = model(ps, ref_s, float(speed))
202
+ audio_np = audio.detach().cpu().numpy()
203
+ yield audio_np
 
204
  except Exception as e:
205
  raise gr.Error(f"Error generating audio for segment {idx + 1}: {str(e)[:200]}...")
 
 
 
 
 
 
 
 
 
 
 
 
206
  except gr.Error:
207
  raise
208
  except Exception as e:
209
  raise gr.Error(f"Error during speech generation: {str(e)[:200]}...")
210
 
 
 
 
211
 
212
def kokoro_tts(text: str, speed: float, voice: str) -> str:
    """Synthesize *text* with Kokoro and return the path of a temporary
    24 kHz mono WAV file containing the full (concatenated) audio.

    Raises gr.Error when the pipeline produces no segments.
    """
    sample_rate = 24_000
    chunks = list(_kokoro_segment_generator(text, speed, voice))
    if not chunks:
        raise gr.Error("No audio was generated.")

    # Single segment: use it as-is; otherwise stitch all segments together.
    if len(chunks) == 1:
        full_audio = chunks[0]
    else:
        full_audio = np.concatenate(chunks, axis=0)

    pcm = _audio_np_to_int16(full_audio)
    return _write_wav_file(pcm, sample_rate)
221
 
222
+
223
def kokoro_tts_stream(text: str, speed: float, voice: str):
    """Yield one complete WAV byte payload per Kokoro segment.

    Generator form of kokoro_tts for streaming playback: each yielded
    value is an independent 24 kHz mono WAV blob. Raises gr.Error if
    the pipeline yields nothing at all.
    """
    sample_rate = 24_000
    emitted = 0

    for segment_audio in _kokoro_segment_generator(text, speed, voice):
        emitted += 1
        pcm = _audio_np_to_int16(segment_audio)
        yield _wav_bytes_from_int16(pcm, sample_rate)

    if emitted == 0:
        raise gr.Error("No audio was generated.")
235
 
236
  # Main dispatcher function to handle all services
237
+ def _read_file_bytes(path: str) -> bytes:
238
+ with open(path, "rb") as file:
239
+ data = file.read()
240
+ return data
241
+
242
+
243
def generate_tts(text, service, openai_api_key, openai_model, openai_voice,
                 elevenlabs_api_key, elevenlabs_voice, voice_dict,
                 kokoro_speed, kokoro_voice):
    """Route to appropriate TTS service based on selection"""
    # Kokoro streams one WAV chunk per segment straight to the client.
    if service == "Kokoro":
        yield from kokoro_tts_stream(text, kokoro_speed, kokoro_voice)
        return

    # The remaining services synthesize to a temp file on disk first.
    if service == "OpenAI":
        audio_path = openai_tts(text, openai_model, openai_voice, openai_api_key)
    elif service == "ElevenLabs":
        resolved_voice = voice_dict.get(elevenlabs_voice, elevenlabs_voice)
        audio_path = elevenlabs_tts(text, resolved_voice, elevenlabs_api_key)
    else:
        raise gr.Error(f"Unknown service selected: {service}")

    # Slurp the generated audio, then best-effort delete the temp file
    # even if reading fails.
    try:
        payload = _read_file_bytes(audio_path)
    finally:
        try:
            os.remove(audio_path)
        except OSError:
            pass

    yield payload
268
+
269
  # Function to update ElevenLabs voices when API key changes
270
  def update_elevenlabs_voices(api_key):
271
  """Update voice dropdown when API key is entered"""
 
370
 
371
  audio_output = gr.Audio(
372
  label="Generated Speech",
373
+ streaming=True,
374
+ autoplay=True,
375
+ show_download_button=True,
376
  )
377
 
378
  # ==========================