####################################################################################### # # MIT License # # Copyright (c) [2025] [leonelhs@gmail.com] # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ####################################################################################### # This file implements an API endpoint for the English Kokoro Text-to-Speech (TTS) system. # It provides functionality to generate TTS audio from input English text using the Kokoro voice model. # Source code is based on or inspired by several projects. 
# For more details and proper attribution, please refer to the following resources:
#
# - [Kokoro] - [https://github.com/hexgrad/kokoro]
# - [Misaki] - [https://github.com/hexgrad/misaki]
# - [Kokoro-82M] - [https://huggingface.co/hexgrad/Kokoro-82M]
# - [Kokoro-onnx] - [https://github.com/thewh1teagle/kokoro-onnx]

import os

import gradio as gr
from huggingface_hub import snapshot_download
from kokoro_onnx import Kokoro
from misaki import en, espeak

KOKORO_REPO_ID = "leonelhs/kokoro-thewh1teagle"

# Display label -> Kokoro voice ID. The labels are only shown in the dropdown;
# the voice ID is the value handed to `predict`.
VOICES = {
    '🇺🇸 🚺 Heart ❤️': 'af_heart',
    '🇺🇸 🚺 Bella 🔥': 'af_bella',
    '🇺🇸 🚺 Nicole 🎧': 'af_nicole',
    '🇺🇸 🚺 Aoede': 'af_aoede',
    '🇺🇸 🚺 Kore': 'af_kore',
    '🇺🇸 🚺 Sarah': 'af_sarah',
    '🇺🇸 🚺 Nova': 'af_nova',
    '🇺🇸 🚺 Sky': 'af_sky',
    '🇺🇸 🚺 Alloy': 'af_alloy',
    '🇺🇸 🚺 Jessica': 'af_jessica',
    '🇺🇸 🚺 River': 'af_river',
    '🇺🇸 🚹 Michael': 'am_michael',
    '🇺🇸 🚹 Fenrir': 'am_fenrir',
    '🇺🇸 🚹 Puck': 'am_puck',
    '🇺🇸 🚹 Echo': 'am_echo',
    '🇺🇸 🚹 Eric': 'am_eric',
    '🇺🇸 🚹 Liam': 'am_liam',
    '🇺🇸 🚹 Onyx': 'am_onyx',
    '🇺🇸 🚹 Santa': 'am_santa',
    '🇺🇸 🚹 Adam': 'am_adam',
    '🇬🇧 🚺 Emma': 'bf_emma',
    '🇬🇧 🚺 Isabella': 'bf_isabella',
    '🇬🇧 🚺 Alice': 'bf_alice',
    '🇬🇧 🚺 Lily': 'bf_lily',
    '🇬🇧 🚹 George': 'bm_george',
    '🇬🇧 🚹 Fable': 'bm_fable',
    '🇬🇧 🚹 Lewis': 'bm_lewis',
    '🇬🇧 🚹 Daniel': 'bm_daniel',
}

# Fetch the model repository and get the local snapshot directory.
snapshot = snapshot_download(repo_id=KOKORO_REPO_ID)

# Misaki G2P with espeak-ng fallback
# NOTE(review): both the fallback and the G2P are configured with british=False,
# yet VOICES also offers British voices (bf_* / bm_*). Those are phonemized with
# the American setting — confirm whether that is intended.
fallback = espeak.EspeakFallback(british=False)
g2p = en.G2P(trf=False, british=False, fallback=fallback)

# Kokoro
model_path = os.path.join(snapshot, "kokoro-v1.0.onnx")
voices_path = os.path.join(snapshot, "voices-v1.0.bin")
kokoro = Kokoro(model_path, voices_path)


def predict(text: str, voice: str = 'af_heart', speed: float = 1):
    """
    Generate speech audio from English text input.

    Empty or whitespace-only text is rejected with a gr.Error.

    Parameters:
        text (string): The text to be converted into speech.
        voice (string): The selected male or female voice profile (specific voice ID).
        speed (float): The speaking rate multiplier (e.g., 1.0 = normal speed, 0.8 = slower, 1.2 = faster).

    Returns:
        tuple: The pair (sample_rate, samples) holding the generated speech audio.
    """
    # Reject empty input up front with a user-visible message instead of
    # handing an empty phoneme string to the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Text -> phonemes; the second element returned by the G2P is not needed here.
    phonemes, _ = g2p(text)
    samples, sample_rate = kokoro.create(phonemes, voice, speed, is_phonemes=True)
    # The output component receives the rate first, then the samples.
    return sample_rate, samples


app = gr.Interface(
    predict,
    [
        gr.Textbox(label='Input Text'),
        # (label, value) pairs: the dropdown shows the label and passes the voice ID.
        gr.Dropdown(list(VOICES.items()), value='af_heart', label='Voice'),
        gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed'),
    ],
    gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True),
    description="Kokoro TTS 🇺🇸 🇬🇧 API Endpoint",
)

# Configure the queue BEFORE launching: launch(debug=True) blocks the main
# thread, so anything placed after it only runs once the server has stopped.
app.queue()
app.launch(share=False, debug=True, show_error=True, mcp_server=True)