# mcp_host/tts/hf_zero_gpu_tts.py

from typing import Generator, Iterator
import numpy as np
import torch
import spaces
from kokoro import KPipeline, KModel
from stream2sentence import generate_sentences
from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES
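
# Note: the imported helpers are assumed to have roughly this shape:
#   VOICES: mapping of display names to Kokoro voice codes (e.g. "af_heart");
#       the first letter of each code is the Kokoro language code.
#   KOKORO_TO_STD_LANG: mapping from Kokoro language codes to the standard
#       language codes that stream2sentence expects (e.g. "a" -> "en", assumed).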

__all__ = ["stream_text_to_speech"]

# Use the first CUDA device when available, otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"
model = KModel().to(device).eval()

# Create a pipeline for each language. Kokoro language codes:
# 🇺🇸 'a' => American English
# 🇬🇧 'b' => British English
# 🇪🇸 'e' => Spanish (es)
# 🇫🇷 'f' => French (fr-fr)
# 🇮🇳 'h' => Hindi (hi)
# 🇮🇹 'i' => Italian (it)
# 🇯🇵 'j' => Japanese (requires `pip install misaki[ja]`)
# 🇧🇷 'p' => Brazilian Portuguese (pt-br)
# 🇨🇳 'z' => Mandarin Chinese (requires `pip install misaki[zh]`)
pipes = {
    lang_code: KPipeline(lang_code=lang_code, model=model, device=device)
    for lang_code in "abzefhip"
    # for lang_code in "abjzefhip"
}

# Preload voices into the pipelines at import time so the first request does
# not pay the voice-loading cost.
for voice_code in VOICES.values():
    # The first letter of the voice code is the language code (Kokoro format).
    lang_code = voice_code[0]
    if lang_code in pipes:
        pipes[lang_code].load_voice(voice_code)


def stream_text_to_speech(
    text_stream: Iterator[str], voice: str | None = None
) -> Generator[tuple[int, np.ndarray], None, None]:
    """
    Convert a stream of text into speech using the specified voice.

    Args:
        text_stream (Iterator[str]): The text chunks to convert to speech.
        voice (str): The voice to use for the conversion. Defaults to "af_heart".

    Yields:
        tuple[int, np.ndarray]: The sample rate (24 kHz) and an audio chunk as a
            NumPy array, one pair per synthesized sentence segment.
    """
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")
    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]
    # Split the incoming stream into full sentences, then synthesize each one.
    for text in generate_sentences(
        text_stream, language=standard_lang_code, full_sentence_delimiters=".?!:\n…。"
    ):
        text = text.strip()
        print(f"Streaming audio for text: {text}")
        for audio in text_to_speech(text, pipe_key=kokoro_lang, voice=voice):
            # Kokoro generates audio at a 24 kHz sample rate.
            yield 24000, audio


@spaces.GPU(duration=10)
def text_to_speech(
    text: str,
    pipe_key: str,
    voice: str | None = None,
) -> Generator[np.ndarray, None, None]:
    # On a ZeroGPU Space, the decorator requests a GPU for this call (up to
    # roughly `duration` seconds); on other hardware it has no effect.
    # Each pipeline result unpacks to (graphemes, phonemes, audio).
    for _, __, audio in pipes[pipe_key](text, voice=voice):
        yield audio.numpy()
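

# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the module API).
# It assumes `soundfile` is installed, feeds a hard-coded list of text chunks
# through `stream_text_to_speech`, concatenates the returned audio chunks, and
# writes them to a WAV file.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    import soundfile as sf

    def fake_llm_stream() -> Iterator[str]:
        # Stand-in for a real token stream (e.g. from an LLM API).
        yield "Hello there! "
        yield "This is a quick demo of streaming "
        yield "text-to-speech with Kokoro."

    sample_rate = 24000
    chunks: list[np.ndarray] = []
    for sample_rate, audio in stream_text_to_speech(fake_llm_stream(), voice="af_heart"):
        chunks.append(audio)

    sf.write("demo.wav", np.concatenate(chunks), sample_rate)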