############################################################################
# Model: TransformerTTS
# Training: LJSpeech
# Author: Kasturi Saha
############################################################################

###################################
# Experiment Parameters and setup #
###################################
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref ./results/transformerTTS/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
epochs: 5
keep_checkpoint_interval: 50

###################################
# Progress Samples                #
###################################
# Progress samples are used to monitor the progress
# of an ongoing training session by outputting samples
# of spectrograms, alignments, etc. at regular intervals

# Whether to enable progress samples
progress_samples: False

# The path where the samples will be stored
progress_sample_path: !ref <output_folder>/samples

# The interval, in epochs. For instance, if it is set to 5,
# progress samples will be output every 5 epochs
progress_samples_interval: 1

# The sample size for raw batch samples saved in batch.pth
# (useful mostly for model debugging)
progress_batch_sample_size: 3

#################################
# Data files and pre-processing #
#################################
data_folder: !ref ./data/LJSpeech-1.1 # e.g., /localscratch/ljspeech
preprocessed_data_folder: !ref ./data/LJSpeech-1.1/preprocessed/phone_seq # e.g., /localscratch/ljspeech
preprocessed_melspectrogram_folder: !ref ./data/LJSpeech-1.1/preprocessed/melspectrogram # e.g., /localscratch/ljspeech

train_json: !ref ./save/train.json
valid_json: !ref ./save/valid.json
test_json: !ref ./save/test.json

splits: ["train", "valid", "test"]
split_ratio: [70, 10, 20]

skip_prep: False

################################
# Audio Parameters             #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: False
power: 1.2
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True

################################
# Optimization Hyperparameters #
################################
learning_rate: 0.001
weight_decay: 0.000006
batch_size: 8 # minimum 2
num_workers: 0
mask_padding: True

train_dataloader_opts:
    batch_size: !ref <batch_size>
    drop_last: False
    num_workers: !ref <num_workers>
    collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

valid_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
    collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

test_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
    collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

################################
# Model Parameters and model   #
################################
n_symbols: 148 # fixed, depending on the symbols in textToSequence
symbols_embedding_dim: 512
hidden_dim: 256
eprenet_dim: 512
n_prenet_layers: 3
dprenet_dim: 256
postnet_dim: 256
ff_dim: 1024
n_heads: 8
n_layers: 6
n_postnet_layers: 5

# Decoder parameters
# The number of frames in the target per encoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: False

# Attention parameters
attention_rnn_dim: 1024
attention_dim: 128

# Location Layer parameters
attention_location_n_filters: 32
attention_location_kernel_size: 31

# Mel-post processing network parameters
postnet_embedding_dim: 256
postnet_kernel_size: 5
postnet_n_convolutions: 5
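# Note on notation (illustrative, not specific to this recipe): in HyperPyYAML,
# "!ref <key>" substitutes a value defined earlier in this file,
# "!new:module.Class" instantiates that class with the nested keys as
# constructor arguments, and "!name:module.func" binds a callable with the
# nested keys as keyword arguments. A minimal sketch with made-up keys:
#
#   d_model: 256
#   linear: !new:torch.nn.Linear
#       in_features: !ref <d_model>
#       out_features: !ref <d_model>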
# Model
model: !new:TransformerTTS.TransformerTTS
    n_mel_channels: !ref <n_mel_channels>
    # symbols
    n_symbols: !ref <n_symbols>
    symbols_embedding_dim: !ref <symbols_embedding_dim>
    eprenet_dim: 512
    n_prenet_layers: 3
    # decoder
    dprenet_dim: !ref <dprenet_dim>
    # postnet
    postnet_dim: !ref <postnet_dim>
    hidden_dim: !ref <hidden_dim>
    n_postnet_layers: !ref <n_postnet_layers>
    nhead: !ref <n_heads>

guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
guided_attention_hard_stop: 50
gate_loss_weight: 1.0

guided_attention_scheduler: !new:speechbrain.nnet.schedulers.StepScheduler
    initial_value: !ref <guided_attention_weight>
    half_life: !ref <guided_attention_weight_half_life>

criterion: !new:TransformerTTS.Loss
    gate_loss_weight: !ref <gate_loss_weight>
    guided_attention_weight: !ref <guided_attention_weight>
    guided_attention_sigma: !ref <guided_attention_sigma>
    guided_attention_scheduler: !ref <guided_attention_scheduler>
    guided_attention_hard_stop: !ref <guided_attention_hard_stop>

modules:
    model: !ref <model>

# Optimizer
opt_class: !name:torch.optim.Adam
    lr: !ref <learning_rate>
    weight_decay: !ref <weight_decay>

# Epoch object
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <epochs>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

# Annealing function
lr_annealing: !new:speechbrain.nnet.schedulers.IntervalScheduler
    intervals:
        - steps: 6000
          lr: 0.0005
        - steps: 8000
          lr: 0.0003
        - steps: 10000
          lr: 0.0001

# Checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        counter: !ref <epoch_counter>
        scheduler: !ref <lr_annealing>

progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
    output_path: !ref <progress_sample_path>
    batch_sample_size: !ref <progress_batch_sample_size>
    formats:
        raw_batch: raw

max_grad_norm: 1.0

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
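
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): this is a HyperPyYAML hyperparameter file,
# normally loaded by the recipe's training script. The snippet below is a
# minimal, hedged example of how such a file is typically consumed; the file
# name, the override values, and the surrounding training loop are assumptions,
# not part of this recipe.
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("hyperparams.yaml") as fin:               # hypothetical file name
#       hparams = load_hyperpyyaml(fin, overrides={"seed": 1986})
#
#   model = hparams["model"]                            # TransformerTTS instance
#   optimizer = hparams["opt_class"](model.parameters())  # Adam with lr/weight_decay bound
# ---------------------------------------------------------------------------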