import os
import pathlib
import time
from collections import namedtuple
from typing import List

import numpy as np
import torch
from TTS.api import TTS

# Auto-accept the Coqui model license prompt so the model can download non-interactively.
os.environ["COQUI_TOS_AGREED"] = "1"


# Each voice bundles reference clips for cloning (neutral/angry) and a playback speed.
Voice = namedtuple("Voice", ["name", "neutral", "angry", "speed"])

# Directory containing this file, used to build absolute paths to the reference audio.
file_full_path = pathlib.Path(os.path.realpath(__file__)).parent

voices = [
    Voice(
        "Attenborough",
        neutral=f"{file_full_path}/audio/attenborough/neutral.wav",
        angry=None,
        speed=1.1,
    ),
    Voice(
        "Rick",
        neutral=f"{file_full_path}/audio/rick/neutral.wav",
        angry=None,
        speed=1.1,
    ),
    Voice(
        "Freeman",
        neutral=f"{file_full_path}/audio/freeman/neutral.wav",
        angry=f"{file_full_path}/audio/freeman/angry.wav",
        speed=1.1,
    ),
    Voice(
        "Walken",
        neutral=f"{file_full_path}/audio/walken/neutral.wav",
        angry=None,
        speed=1.1,
    ),
    Voice(
        "Darth Vader",
        neutral=f"{file_full_path}/audio/darth/neutral.wav",
        angry=None,
        speed=1.1,
    ),
]

def load_tts_pipeline():
    # Run on GPU when available; XTTS v2 also works on CPU, just more slowly.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts_pipeline = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    return tts_pipeline

def compute_speaker_embedding(voice_path: str, config, pipeline, cache):
    # Compute the GPT conditioning latents and speaker embedding for a reference
    # wav, caching the result so each voice is only encoded once.
    if voice_path not in cache:
        cache[voice_path] = pipeline.synthesizer.tts_model.get_conditioning_latents(
            audio_path=voice_path,
            gpt_cond_len=config.gpt_cond_len,
            gpt_cond_chunk_len=config.gpt_cond_chunk_len,
            max_ref_length=config.max_ref_len,
            sound_norm_refs=config.sound_norm_refs,
        )
    return cache[voice_path]

voice_options = []
for voice in voices:
    if voice.neutral:
        voice_options.append(f"{voice.name} - Neutral")
    if voice.angry:
        voice_options.append(f"{voice.name} - Angry")

def voice_from_text(voice):
    for v in voices:
        if voice == f"{v.name} - Neutral":
            return v.neutral
        if voice == f"{v.name} - Angry":
            return v.angry
    raise ValueError(f"Voice {voice} not found.")

def speed_from_text(voice):
    for v in voices:
        if voice == f"{v.name} - Neutral":
            return v.speed
        if voice == f"{v.name} - Angry":
            return v.speed
    raise ValueError(f"Voice {voice} not found.")

# Adapted from Coqui's `Synthesizer.tts`: it reuses the synthesizer internals but
# takes precomputed conditioning latents and a speaker embedding instead of a
# speaker wav, so the reference audio is not re-encoded on every request.
def tts(
    self,
    text: str = "",
    language_name: str = "",
    reference_wav=None,
    gpt_cond_latent=None,
    speaker_embedding=None,
    split_sentences: bool = True,
    **kwargs,
) -> List[int]:
    """🐸 TTS magic. Run all the models and generate speech.

    Args:
        text (str): input text.
        language_name (str, optional): language id for multi-language models. Defaults to "".
        reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
        gpt_cond_latent ([type], optional): precomputed GPT conditioning latents for the target voice. Defaults to None.
        speaker_embedding ([type], optional): precomputed speaker embedding for the target voice. Defaults to None.
        split_sentences (bool, optional): split the input text into sentences. Defaults to True.
        **kwargs: additional arguments to pass to the TTS model.

    Returns:
        List[int]: waveform samples of the generated speech.
    """
    start_time = time.time()
    use_gl = self.vocoder_model is None
    wavs = []

    if not text and not reference_wav:
        raise ValueError(
            "You need to define either `text` (for synthesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
        )

    if text:
        sens = [text]
        if split_sentences:
            print(" > Text split into sentences.")
            sens = self.split_into_sentences(text)
        print(sens)

    if not reference_wav:
        for sen in sens:
            outputs = self.tts_model.inference(
                sen,
                language_name,
                gpt_cond_latent,
                speaker_embedding,
                temperature=0.75,
                length_penalty=1.0,
                repetition_penalty=10.0,
                top_k=50,
                top_p=0.85,
                do_sample=True,
                **kwargs,
            )
            waveform = outputs["wav"]
            if (
                torch.is_tensor(waveform)
                and waveform.device != torch.device("cpu")
                and not use_gl
            ):
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            wavs += list(waveform)
            # Append a brief silence so consecutive sentences do not run together.
            wavs += [0] * 10000

    process_time = time.time() - start_time
    audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
    print(f" > Processing time: {process_time}")
    print(f" > Real-time factor: {process_time / audio_time}")
    return wavs


def tts_gradio(tts_pipeline, text, voice, cache):
    voice_path = voice_from_text(voice)
    (gpt_cond_latent, speaker_embedding) = compute_speaker_embedding(
        voice_path, tts_pipeline.synthesizer.tts_config, tts_pipeline, cache
    )
    out = tts(
        tts_pipeline.synthesizer,
        text,
        language_name="en",
        speaker=None,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        # Use the per-voice playback speed instead of a hard-coded value.
        speed=speed_from_text(voice),
    )
    # Gradio's Audio output expects a (sample_rate, numpy array) tuple.
    return (22050, np.array(out)), dict(text=text, voice=voice)
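

# Minimal usage sketch of how the pieces above fit together, assuming the
# reference wavs under audio/ exist next to this file; adjust the voice label
# to any entry in `voice_options`.
if __name__ == "__main__":
    pipeline = load_tts_pipeline()
    latent_cache = {}
    (sample_rate, samples), _meta = tts_gradio(
        pipeline, "Hello there, this is a cloned voice.", "Freeman - Neutral", latent_cache
    )
    print(f"Generated {len(samples) / sample_rate:.1f}s of audio at {sample_rate} Hz")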