Spaces:
Running
Running
| from typing import Generator, Iterator | |
| import numpy as np | |
| import torch | |
| import spaces | |
| from kokoro import KPipeline, KModel | |
| from stream2sentence import generate_sentences | |
| from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES | |
__all__ = ["stream_text_to_speech"]

# Use CUDA device 0 when torch reports CUDA as available, else the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# A single model instance is handed to every language pipeline below.
model = KModel().to(device)
model = model.eval()

# One pipeline per Kokoro language code:
#   'a' => American English     'b' => British English
#   'e' => Spanish (es)         'f' => French (fr-fr)
#   'h' => Hindi (hi)           'i' => Italian (it)
#   'p' => Brazilian Portuguese (pt-br)
#   'z' => Mandarin Chinese (requires: pip install misaki[zh])
# Japanese ('j', requires: pip install misaki[ja]) is not enabled here.
pipes = {}
for lang_code in "abzefhip":
    pipes[lang_code] = KPipeline(lang_code=lang_code, model=model, device=device)

# Preload every known voice into the pipeline for its language.
for voice_code in VOICES.values():
    # The first letter of a voice code is its Kokoro language code.
    lang_code = voice_code[0]
    _pipeline = pipes.get(lang_code)
    if _pipeline is None:
        # No pipeline was created for this language; nothing to preload.
        continue
    _pipeline.load_voice(voice_code)
def stream_text_to_speech(
    text_stream: Iterator[str], voice: str | None = None
) -> Generator[tuple[int, np.ndarray], None, None]:
    """
    Stream synthesized speech for an incoming stream of text.

    The text stream is split into sentences with ``generate_sentences``; each
    non-empty sentence is passed to ``text_to_speech`` using the pipeline
    selected by the first letter of the voice code.

    Args:
        text_stream (Iterator[str]): Chunks of text to convert to speech.
        voice (str | None): The voice to use. Defaults to "af_heart".

    Yields:
        tuple[int, np.ndarray]: ``(24000, audio)`` for every audio chunk
        produced by ``text_to_speech`` (the first element is presumably the
        sample rate in Hz -- confirm against the consumer).

    Raises:
        ValueError: If ``voice`` is not one of the values of ``VOICES``.
            Because this is a generator, the error surfaces when iteration
            starts, not when the function is called.
    """
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")

    # The first letter of a voice code is its Kokoro language code.
    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]

    sentences = generate_sentences(
        text_stream, language=standard_lang_code, full_sentence_delimiters=".?!:\n…。"
    )
    for text in sentences:
        text = text.strip()
        if not text:
            # Whitespace-only sentence: there is nothing to synthesize.
            continue
        print(f"Streaming audio for text: {text}")
        for audio in text_to_speech(text, pipe_key=kokoro_lang, voice=voice):
            yield 24000, audio
def text_to_speech(
    text: str,
    pipe_key: str,
    voice: str | None = None,
) -> Generator[np.ndarray, None, None]:
    """
    Synthesize ``text`` with one of the preloaded pipelines.

    Args:
        text (str): The text to synthesize.
        pipe_key (str): Key of the pipeline to use in the module-level
            ``pipes`` mapping.
        voice (str | None): Voice forwarded to the pipeline call.

    Yields:
        The result of calling ``.numpy()`` on the third item of every
        result produced by the pipeline.
    """
    pipeline = pipes[pipe_key]
    # Each pipeline result unpacks into three items; only the last is used
    # (the first two are presumably the graphemes and phonemes -- confirm).
    for _graphemes, _phonemes, waveform in pipeline(text, voice=voice):
        yield waveform.numpy()