Spaces:
Build error
Build error
import ast | |
import pprint | |
import json | |
class HParams(object):
    """Attribute-style container for hyperparameters.

    Every keyword passed to the constructor becomes an instance attribute.
    Values can also be read and written with item syntax (``hp["name"]``),
    overridden from a ``name=value`` string, and loaded from or saved as JSON.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, key):
        return getattr(self, key)

    def __repr__(self):
        return pprint.pformat(self.__dict__)

    def parse(self, string):
        """Override hparams from a comma-separated string of name=value pairs.

        Each value is evaluated with ``ast.literal_eval``, so it must be a
        Python literal (``"lr=1e-3, name='x'"``). Pairs are separated on every
        comma, so values that themselves contain a comma (lists, tuples) are
        not supported by this syntax.

        Args:
            string: Override string; an empty string (or None) is a no-op.

        Returns:
            This object, to allow chaining.

        Raises:
            ValueError: If a pair has no ``=`` or its value is not a literal.
            SyntaxError: If a value is not valid Python literal syntax.
        """
        if not string:
            return self

        # Parse everything first so that a malformed pair leaves the object
        # untouched instead of half-updated.
        overrides = {}
        for fragment in string.split(","):
            if not fragment.strip():
                # Tolerate a trailing comma or doubled separators.
                continue
            # Split on the first "=" only, so values may contain "=".
            name, separator, raw_value = fragment.partition("=")
            if not separator:
                raise ValueError(
                    "Expected name=value, got %r" % fragment.strip()
                )
            # A repeated name simply overwrites the earlier one (last wins).
            overrides[name.strip()] = ast.literal_eval(raw_value.strip())

        self.__dict__.update(overrides)
        return self

    def loadJson(self, dict):
        """Copy the entries of a mapping onto this object.

        The keys ``tts_schedule`` and ``tts_finetune_layers`` are skipped, so
        the values already stored for them are kept.

        Args:
            dict: Mapping of hyperparameter names to values (the name shadows
                the builtin but is kept for caller compatibility).

        Returns:
            This object, to allow chaining.
        """
        print("\nLoading the json with %s\n" % (dict,))
        skipped = ("tts_schedule", "tts_finetune_layers")
        for name, value in dict.items():
            if name not in skipped:
                self.__dict__[name] = value
        return self

    def dumpJson(self, fp):
        """Write all hyperparameters as JSON to the given path.

        Args:
            fp: Object with a ``pathlib.Path``-style ``open`` method; the file
                is opened for writing as UTF-8.

        Returns:
            This object, to allow chaining.
        """
        print("\nSaving the json with %s\n" % (fp,))
        with fp.open("w", encoding="utf-8") as f:
            json.dump(self.__dict__, f)
        return self
# Default hyperparameter set; each comment describes the setting(s) below it.
hparams = HParams(
    # ---- Signal processing (used in both synthesizer and vocoder) ----
    sample_rate=16000,
    n_fft=800,
    num_mels=80,
    # Tacotron uses a 12.5 ms frame shift (set to sample_rate * 0.0125).
    hop_size=200,
    # Tacotron uses a 50 ms frame length (set to sample_rate * 0.050).
    win_size=800,
    fmin=55,
    min_level_db=-100,
    ref_level_db=20,
    # Gradient explodes if too big, premature convergence if too small.
    max_abs_value=4.0,
    # Filter coefficient to use if preemphasize is True.
    preemphasis=0.97,
    preemphasize=True,

    # ---- Tacotron text-to-speech (TTS) ----
    # Embedding dimension for the grapheme/phoneme inputs.
    tts_embed_dims=512,
    tts_encoder_dims=256,
    tts_decoder_dims=128,
    tts_postnet_dims=512,
    tts_encoder_K=5,
    tts_lstm_dims=1024,
    tts_postnet_K=5,
    tts_num_highways=4,
    tts_dropout=0.5,
    tts_cleaner_names=["basic_cleaners"],
    # Value below which audio generation ends. For example, for a range of
    # [-4, 4], this will terminate the sequence at the first frame that has
    # all values < -3.4.
    tts_stop_threshold=-3.4,

    # ---- Tacotron training ----
    # Progressive training schedule; each entry is (r, lr, step, batch_size):
    #   r  = reduction factor (# of mel frames synthesized for each decoder
    #        iteration)
    #   lr = learning rate
    tts_schedule=[
        (2, 1e-3, 10_000, 12),
        (2, 5e-4, 15_000, 12),
        (2, 2e-4, 20_000, 12),
        (2, 1e-4, 30_000, 12),
        (2, 5e-5, 40_000, 12),
        (2, 1e-5, 60_000, 12),
        (2, 5e-6, 160_000, 12),
        (2, 3e-6, 320_000, 12),
        (2, 1e-6, 640_000, 12),
    ],
    # Clips the gradient norm to prevent explosion; set to None if not needed.
    tts_clip_grad_norm=1.0,
    # Number of steps between model evaluation (sample generation). Set to -1
    # to generate after completing an epoch, or 0 to disable.
    tts_eval_interval=500,
    # Makes this number of samples.
    tts_eval_num_samples=1,
    # For finetune usage: if set, only the selected layers will be trained.
    # Available: encoder, encoder_proj, gst, decoder, postnet, post_proj.
    tts_finetune_layers=[],

    # ---- Data preprocessing ----
    max_mel_frames=900,
    rescale=True,
    rescaling_max=0.9,
    # For vocoder preprocessing and inference.
    synthesis_batch_size=16,

    # ---- Mel visualization and Griffin-Lim ----
    signal_normalization=True,
    power=1.5,
    griffin_lim_iters=60,

    # ---- Audio processing options ----
    # Should not exceed (sample_rate // 2).
    fmax=7600,
    # Used when signal_normalization = True.
    allow_clipping_in_normalization=True,
    # If true, discards samples exceeding max_mel_frames.
    clip_mels_length=True,
    # "Fast spectrogram phase recovery using local weighted sums".
    use_lws=False,
    # Sets mel range to [-max_abs_value, max_abs_value] if True, and
    # [0, max_abs_value] if False.
    symmetric_mels=True,
    # Use with sample_rate of 16000 for best results.
    trim_silence=True,

    # ---- SV2TTS ----
    # Dimension for the speaker embedding.
    speaker_embedding_size=256,
    # Duration in seconds of a silence for an utterance to be split.
    silence_min_duration_split=0.4,
    # Duration in seconds below which utterances are discarded.
    utterance_min_duration=1.6,
    # Whether to use global style token.
    use_gst=True,
    # Whether to use speaker embedding referenced for global style token.
    use_ser_for_gst=True,
)