Spaces:

Shadhil
/

Voice-Clone

Running

App Files Files Community

Voice-Clone / TTS /tts /configs /neuralhmm_tts_config.py

Shadhil

voice-clone with single audio sample input

9b2107c 8 months ago

raw

history blame

No virus

7.91 kB

	from dataclasses import dataclass, field
	from typing import List

	from TTS.tts.configs.shared_configs import BaseTTSConfig


	@dataclass
	class NeuralhmmTTSConfig(BaseTTSConfig):
	    """Define parameters for the Neural HMM TTS model.

	    Example:

	        >>> from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig
	        >>> config = NeuralhmmTTSConfig()

	    Args:
	        model (str):
	            Model name used to select the right model class to initialize. Defaults to `NeuralHMM_TTS`.
	        run_eval_steps (int):
	            Run an evaluation epoch after N steps. If None, waits until the training epoch is completed.
	            Defaults to 100.
	        save_step (int):
	            Save a local checkpoint every save_step steps. Defaults to 500.
	        plot_step (int):
	            Plot training stats on the logger every plot_step steps. Defaults to 1.
	        model_param_stats (bool):
	            Log model parameter stats on the logger dashboard. Defaults to False.
	        force_generate_statistics (bool):
	            Force generation of the mel normalization statistics. Defaults to False.
	        mel_statistics_parameter_path (str):
	            Path to the mel normalization statistics. If the model does not find a file there, it will
	            generate the statistics. Defaults to None.
	        num_chars (int):
	            Number of characters used by the model. It must be defined before initializing the model.
	            Defaults to None.
	        state_per_phone (int):
	            Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but here it is
	            upsampled by the model's encoder. Defaults to 2.
	        encoder_in_out_features (int):
	            Channels of the encoder input and character embedding tensors. Defaults to 512.
	        encoder_n_convolutions (int):
	            Number of convolution layers in the encoder. Defaults to 3.
	        out_channels (int):
	            Channels of the final model output. It must match the spectrogram size. Defaults to 80.
	        ar_order (int):
	            Autoregressive order of the model. Defaults to 1. In ablations of Neural HMM it was found that
	            more autoregression, while giving more variation, hurts the naturalness of the synthesised audio.
	        sampling_temp (float):
	            Variation added to the sample from the latent space of the neural HMM. Defaults to 0.
	        deterministic_transition (bool):
	            Deterministic duration generation based on duration quantiles, as defined in S. Ronanki,
	            O. Watts, S. King, and G. E. Henter, "Median-based generation of synthetic speech durations
	            using a non-parametric approach," in Proc. SLT, 2016. Defaults to True.
	        duration_threshold (float):
	            Threshold for the duration quantiles. Defaults to 0.43. Tune this to change the speaking rate of
	            the synthesis: lower values give a slower speaking rate and higher values a faster one.
	        use_grad_checkpointing (bool):
	            Use gradient checkpointing to save memory. In a multi-GPU setting PyTorch currently does not
	            support gradient checkpointing inside a loop, so it has to be turned off there. Adjust depending
	            on whichever gives the larger batch size: a single GPU or multi-GPU. Defaults to True.
	        max_sampling_time (int):
	            Maximum sampling time while synthesising latents from the neural HMM. Defaults to 1000.
	        prenet_type (str):
	            `original` or `bn`. `original` sets the default Prenet and `bn` uses the Batch Normalization
	            version of the Prenet. Defaults to `original`.
	        prenet_dim (int):
	            Dimension of the Prenet. Defaults to 256.
	        prenet_n_layers (int):
	            Number of layers in the Prenet. Defaults to 2.
	        prenet_dropout (float):
	            Dropout rate of the Prenet. Defaults to 0.5.
	        prenet_dropout_at_inference (bool):
	            Use dropout at inference time. Defaults to True.
	        memory_rnn_dim (int):
	            Dimension of the memory LSTM that processes the prenet output. Defaults to 1024.
	        outputnet_size (list[int]):
	            Size of the output network inside the neural HMM. Must contain at least one layer.
	            Defaults to [1024].
	        flat_start_params (dict):
	            Parameters for the flat start initialization of the neural HMM. Defaults to
	            `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`. It will be recomputed when you pass the dataset.
	        std_floor (float):
	            Floor value for the standard deviation of the neural HMM. Prevents the model from cheating by
	            putting a point mass on, and getting infinite likelihood at, any datapoint. It is called
	            `variance flooring` in standard HMM literature. Defaults to 0.001.
	        optimizer (str):
	            Optimizer to use for training. Defaults to `Adam`.
	        optimizer_params (dict):
	            Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
	        grad_clip (float):
	            Gradient clipping threshold. Defaults to 40000.0.
	        lr (float):
	            Learning rate. Defaults to 1e-3.
	        lr_scheduler (str):
	            Learning rate scheduler for the training. Use one from the `torch.optim.lr_scheduler` schedulers
	            or `TTS.utils.training`. Defaults to None.
	        min_text_len (int):
	            Override of the inherited setting. Presumably the minimum input text length used at training
	            (confirm against the base config). Defaults to 10.
	        max_text_len (int):
	            Override of the inherited setting. Presumably the maximum input text length used at training;
	            larger values result in more VRAM usage (confirm against the base config). Defaults to 500.
	        min_audio_len (int):
	            Override of the inherited setting. Presumably the minimum audio length used at training
	            (confirm against the base config). Defaults to 512.
	        test_sentences (list[str]):
	            Sentences used for testing. Defaults to `["Be a voice, not an echo."]`.
	        r (int):
	            Extra setting needed by this config; presumably the decoder reduction factor expected by shared
	            TTS code (confirm against callers). Defaults to 1.
	        use_d_vector_file (bool):
	            Extra setting needed by this config. Defaults to False.
	        use_speaker_embedding (bool):
	            Extra setting needed by this config. Defaults to False.
	    """

	    model: str = "NeuralHMM_TTS"

	    # Training and Checkpoint configs
	    run_eval_steps: int = 100
	    save_step: int = 500
	    plot_step: int = 1
	    model_param_stats: bool = False

	    # data parameters
	    force_generate_statistics: bool = False
	    mel_statistics_parameter_path: str = None

	    # Encoder parameters
	    num_chars: int = None
	    state_per_phone: int = 2
	    encoder_in_out_features: int = 512
	    encoder_n_convolutions: int = 3

	    # HMM parameters
	    out_channels: int = 80
	    ar_order: int = 1
	    sampling_temp: float = 0
	    deterministic_transition: bool = True
	    duration_threshold: float = 0.43
	    use_grad_checkpointing: bool = True
	    max_sampling_time: int = 1000

	    ## Prenet parameters
	    prenet_type: str = "original"
	    prenet_dim: int = 256
	    prenet_n_layers: int = 2
	    prenet_dropout: float = 0.5
	    prenet_dropout_at_inference: bool = True
	    memory_rnn_dim: int = 1024

	    ## Outputnet parameters
	    # Mutable defaults go through `default_factory` so instances never share the same list/dict.
	    outputnet_size: List[int] = field(default_factory=lambda: [1024])
	    flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
	    std_floor: float = 0.001

	    # optimizer parameters
	    optimizer: str = "Adam"
	    optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
	    grad_clip: float = 40000.0
	    lr: float = 1e-3
	    lr_scheduler: str = None

	    # overrides
	    min_text_len: int = 10
	    max_text_len: int = 500
	    min_audio_len: int = 512

	    # testing
	    test_sentences: List[str] = field(
	        default_factory=lambda: [
	            "Be a voice, not an echo.",
	        ]
	    )

	    # Extra needed config
	    r: int = 1
	    use_d_vector_file: bool = False
	    use_speaker_embedding: bool = False

	    def check_values(self):
	        """Validate the hyperparameters.

	        Raises:
	            AssertionError: when `ar_order` is not greater than 0.
	            AssertionError: when `outputnet_size` does not define at least one layer.
	            AssertionError: when the flat-start `transition_p` is not strictly between 0 and 1.
	        """
	        assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model."
	        # The message must interpolate `outputnet_size`, the field actually being checked; referencing an
	        # undefined attribute here would turn the intended AssertionError into an AttributeError.
	        assert (
	            len(self.outputnet_size) >= 1
	        ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.outputnet_size}"
	        assert (
	            0 < self.flat_start_params["transition_p"] < 1
	        ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"