Spaces:
Running
Running
File size: 4,151 Bytes
a132885 11bfd4b f078cf1 11bfd4b a132885 11bfd4b a132885 11bfd4b a132885 f078cf1 11bfd4b a132885 11bfd4b a132885 11bfd4b a132885 11bfd4b a132885 11bfd4b a132885 f078cf1 a132885 6d86d13 11bfd4b a132885 11bfd4b a132885 f078cf1 6d86d13 f078cf1 a132885 f078cf1 a132885 f078cf1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import gradio as gr
import torch
import numpy as np
import os
import io
import base64
from kokoro import KModel, KPipeline
# Check if CUDA is available
CUDA_AVAILABLE = torch.cuda.is_available()

# Initialize the model: GPU when available, otherwise CPU; eval mode for inference.
model = KModel().to('cuda' if CUDA_AVAILABLE else 'cpu').eval()

# Initialize pipelines for different language codes (using 'a' for English).
# model=False — presumably the pipeline does only G2P/preprocessing and
# synthesis goes through `model` above; confirm against KPipeline docs.
pipelines = {'a': KPipeline(lang_code='a', model=False)}

# Custom pronunciation for "kokoro" registered in the English G2P lexicon.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
def text_to_audio(text, speed=1.0):
    """Convert text to audio using the Kokoro model.

    Args:
        text: The text to convert to speech.
        speed: Speech speed multiplier (0.5-2.0, where 1.0 is normal speed).

    Returns:
        Tuple of (sample_rate, audio_array) with a 24 kHz sample rate, or
        None when the text is empty/blank or the pipeline yields nothing.

    Raises:
        gr.Error: If the model fails while generating audio.
    """
    # Guard empty and whitespace-only input before touching the pipeline.
    if not text or not text.strip():
        return None
    pipeline = pipelines['a']  # Use English pipeline
    voice = "af_heart"  # Default voice (US English, female, Heart)
    # Load the reference voice pack once for this request.
    pack = pipeline.load_voice(voice)
    # The pipeline yields segments; only the first one is synthesized and
    # returned, matching the original behavior.
    for _, ps, _ in pipeline(text, voice, speed):
        # Reference style vector, indexed by phoneme-sequence length.
        ref_s = pack[len(ps) - 1]
        try:
            audio = model(ps, ref_s, speed)
        except Exception as e:
            raise gr.Error(f"Error generating audio: {str(e)}")
        # BUG FIX: move the tensor to CPU before conversion — .numpy()
        # raises TypeError on CUDA tensors, and the model lives on 'cuda'
        # whenever CUDA_AVAILABLE is true.
        return 24000, audio.detach().cpu().numpy()
    # Pipeline produced no segments.
    return None
def text_to_audio_b64(text, speed=1.0):
    """Convert text to speech and return it as a base64-encoded WAV file.

    Args:
        text: The text to convert to speech.
        speed: Speech speed multiplier (0.5-2.0, where 1.0 is normal speed).

    Returns:
        Base64-encoded WAV data as a string, or None if no audio was produced.
    """
    import soundfile as sf

    synthesized = text_to_audio(text, speed)
    if synthesized is None:
        return None
    rate, samples = synthesized

    # Render the samples into an in-memory WAV container, then base64 it.
    buffer = io.BytesIO()
    sf.write(buffer, samples, rate, format='WAV')
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
# Create Gradio interface
with gr.Blocks(title="Kokoro Text-to-Audio MCP") as app:
    gr.Markdown("# 🎵 Kokoro Text-to-Audio MCP")
    gr.Markdown("Convert text to speech using the Kokoro-82M model")
    with gr.Row():
        with gr.Column():
            # Input controls: free-form text plus a speed multiplier
            # matching text_to_audio's documented 0.5-2.0 range.
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type something to convert to audio...",
                lines=5
            )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Speech Speed"
            )
            submit_btn = gr.Button("Generate Audio")
        with gr.Column():
            # type="numpy" matches text_to_audio's (sample_rate, ndarray) return.
            audio_output = gr.Audio(label="Generated Audio", type="numpy")
    # Wire the button to the synthesis function.
    submit_btn.click(
        fn=text_to_audio,
        inputs=[text_input, speed_slider],
        outputs=[audio_output]
    )
    gr.Markdown("### Usage Tips")
    gr.Markdown("- Adjust the speed slider to modify the pace of speech")
    # Add section about MCP support
    with gr.Accordion("MCP Support (for LLMs)", open=False):
        gr.Markdown("""
### MCP Support
This app supports the Model Context Protocol (MCP), allowing Large Language Models like Claude Desktop to use it as a tool.
To use this app with an MCP client, add the following configuration:
```json
{
"mcpServers": {
"kokoroTTS": {
"url": "https://fdaudens-kokoro-mcp.hf.space/gradio_api/mcp/sse"
}
}
}
```
Replace `your-app-url.hf.space` with your actual Hugging Face Space URL.
""")
# Launch the app with MCP support
# Launch the app with MCP support
if __name__ == "__main__":
    # BUG FIX: enable_mcp was computed but never passed to launch(), so the
    # GRADIO_MCP_SERVER environment variable was dead code. The default is
    # now 'True' so that unset-env behavior (MCP enabled) is unchanged.
    enable_mcp = os.environ.get('GRADIO_MCP_SERVER', 'True').lower() in ('true', '1', 't')
    app.launch(mcp_server=enable_mcp)