Spaces:

sitatech
/

vibe-shopping

Running

App Files Files Community

vibe-shopping / mcp_host /tts /hf_zero_gpu_tts.py

sitatech

Add notes

98a160d 5 months ago

raw

history blame

2.31 kB

	from typing import Generator, Iterator

	import numpy as np
	import torch
	import spaces
	from kokoro import KPipeline, KModel
	from stream2sentence import generate_sentences

	from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES

	__all__ = ["stream_text_to_speech"]


	# Target device: CUDA device index 0 when CUDA is available, otherwise the CPU.
	if torch.cuda.is_available():
	    device = 0
	else:
	    device = "cpu"

	# Single Kokoro model, moved to the target device and switched to eval mode.
	model = KModel().to(device).eval()

	# Build one KPipeline per enabled language code, keyed by that code. Every
	# pipeline is given the same `model` and `device`. Kokoro language codes:
	#   🇺🇸 'a' => American English, 🇬🇧 'b' => British English
	#   🇪🇸 'e' => Spanish es
	#   🇫🇷 'f' => French fr-fr
	#   🇮🇳 'h' => Hindi hi
	#   🇮🇹 'i' => Italian it
	#   🇯🇵 'j' => Japanese: pip install misaki[ja]
	#   🇧🇷 'p' => Brazilian Portuguese pt-br
	#   🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
	# NOTE: 'j' (Japanese) is not part of the enabled set below.
	pipes = {
	    code: KPipeline(lang_code=code, model=model, device=device)
	    for code in "abzefhip"
	}

	# Load every known voice into the pipeline of its language at import time.
	for voice_code in VOICES.values():
	    # Kokoro voice codes start with the language code of the voice.
	    lang_code = voice_code[0]
	    if lang_code not in pipes:
	        # No pipeline was created for this language; nothing to preload.
	        continue
	    pipes[lang_code].load_voice(voice_code)


	def stream_text_to_speech(
	    text_stream: Iterator[str], voice: str | None = None
	) -> Generator[tuple[int, np.ndarray], None, None]:
	    """
	    Convert a stream of text into a stream of audio chunks.

	    The incoming text is split into sentences with ``generate_sentences``;
	    each non-empty sentence is synthesized with ``text_to_speech`` as soon as
	    it is produced, and the resulting audio chunks are yielded one by one.

	    Because this is a generator, validation errors are raised when iteration
	    starts, not when the function is called.

	    Args:
	        text_stream (Iterator[str]): Iterator of text fragments to speak.
	        voice (str | None): Kokoro voice code. Its first letter is the Kokoro
	            language code and selects the pipeline. Defaults to "af_heart".

	    Yields:
	        tuple[int, np.ndarray]: ``(sample_rate, audio)`` pairs. The sample
	        rate is always 24000; ``audio`` is a chunk from ``text_to_speech``.

	    Raises:
	        ValueError: If ``voice`` is not one of the values of ``VOICES`` or no
	            pipeline exists for its language.
	    """
	    voice = voice or "af_heart"
	    kokoro_lang = voice[0]
	    # A voice can be listed in VOICES while its language has no pipeline
	    # (the preload loop guards for the same case). Reject it up front instead
	    # of failing with a KeyError inside the GPU-decorated call.
	    if voice not in VOICES.values() or kokoro_lang not in pipes:
	        raise ValueError(f"Voice '{voice}' is not available.")

	    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]
	    sample_rate = 24000

	    sentences = generate_sentences(
	        text_stream, language=standard_lang_code, full_sentence_delimiters=".?!:\n…。"
	    )
	    for text in sentences:
	        text = text.strip()
	        if not text:
	            # Nothing to speak: skip rather than spend a GPU call on it.
	            continue
	        print(f"Streaming audio for text: {text}")
	        for audio in text_to_speech(text, pipe_key=kokoro_lang, voice=voice):
	            yield sample_rate, audio


	@spaces.GPU(duration=10)
	def text_to_speech(
	    text: str,
	    pipe_key: str,
	    voice: str | None = None,
	) -> Generator[np.ndarray, None, None]:
	    """
	    Synthesize ``text`` with one of the preloaded pipelines.

	    Args:
	        text (str): The text to synthesize.
	        pipe_key (str): Key into ``pipes`` selecting the pipeline to use.
	        voice (str | None): Voice passed through to the pipeline call.

	    Yields:
	        np.ndarray: The audio of each pipeline result, converted with
	        ``.numpy()``.
	    """
	    synthesize = pipes[pipe_key]
	    # Each pipeline result unpacks into three values; only the last one
	    # (the audio) is used here.
	    for _first, _second, waveform in synthesize(text, voice=voice):
	        yield waveform.numpy()