# Generated 2024-03-06 from:
# /home/marconilab/tacotron2/hparams/train.yaml
# yamllint disable
############################################################################
# Model: Tacotron2
# Tokens: Raw characters (English text)
# losses: Spectrogram MSE, stop-gate BCE, guided attention
# Training: LJSpeech
# Authors: Georges Abous-Rjeili, Artem Ploujnikov, Yingzhi Wang
#
############################################################################

###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [1234]
output_folder: ./results/tacotron2/1234
save_folder: ./results/tacotron2/1234/save
train_log: ./results/tacotron2/1234/train_log.txt
epochs: 500
keep_checkpoint_interval: 50
wandb_id: tacotron2-luganda
wandb_user: sulaiman-kagumire
wandb_project: tts-luganda
init_from_pretrained: true

###################################
# Progress Samples                #
###################################
# Progress samples are used to monitor the progress
# of an ongoing training session by outputting samples
# of spectrograms, alignments, etc. at regular intervals

# Whether to enable progress samples
progress_samples: false

# The path where the samples will be stored
progress_sample_path: ./results/tacotron2/1234/samples

# The interval, in epochs. For instance, if it is set to 5,
# progress samples will be output every 5 epochs
progress_samples_interval: 1

# The sample size for raw batch samples saved in batch.pth
# (useful mostly for model debugging)
progress_batch_sample_size: 3

#################################
# Data files and pre-processing #
#################################
data_folder: data_folder  # e.g., /localscratch/ljspeech
train_json: ./results/tacotron2/1234/save/train.json
valid_json: ./results/tacotron2/1234/save/valid.json
test_json: ./results/tacotron2/1234/save/test.json

splits: [train, valid, test]
split_ratio: [80, 10, 10]

skip_prep: false

# Use the original preprocessing from nvidia
# The cleaners to be used (applicable to nvidia only)
text_cleaners: [basic_cleaners]

################################
# Audio Parameters             #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: false
power: 1
norm: slaney
mel_scale: slaney
dynamic_range_compression: true

################################
# Optimization Hyperparameters #
################################
learning_rate: 0.001
weight_decay: 0.000006
batch_size: 256
num_workers: 8
mask_padding: true
guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
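# Illustrative note (not part of the original recipe): with the half-life
# scheduling used here, the guided-attention weight decays smoothly over
# training, roughly as
#   weight(epoch) ~ guided_attention_weight * 0.5 ** (epoch / half_life)
# so with the values above: epoch 0 -> 50.0, epoch 10 -> ~25.0, epoch 20 -> ~12.5,
# until the guided-attention term is switched off at guided_attention_hard_stop.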
guided_attention_hard_stop: 50
gate_loss_weight: 1.0

train_dataloader_opts:
  batch_size: 256
  drop_last: false  #True #False
  num_workers: 8
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

valid_dataloader_opts:
  batch_size: 256
  num_workers: 8
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

test_dataloader_opts:
  batch_size: 256
  num_workers: 8
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

################################
# Model Parameters and model   #
################################
n_symbols: 148  # fixed, depending on the symbols in textToSequence
symbols_embedding_dim: 512

# Encoder parameters
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512

# Decoder parameters
# The number of frames in the target per encoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: false

# Attention parameters
attention_rnn_dim: 1024
attention_dim: 128

# Location Layer parameters
attention_location_n_filters: 32
attention_location_kernel_size: 31

# Mel-post processing network parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5

mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
  sample_rate: 22050
  hop_length: 256
  win_length: 1024
  n_fft: 1024
  n_mels: 80
  f_min: 0.0
  f_max: 8000.0
  power: 1
  normalized: false
  norm: slaney
  mel_scale: slaney
  compression: true

# model
model: &id002 !new:speechbrain.lobes.models.Tacotron2.Tacotron2
  mask_padding: true
  n_mel_channels: 80
  # symbols
  n_symbols: 148
  symbols_embedding_dim: 512
  # encoder
  encoder_kernel_size: 5
  encoder_n_convolutions: 3
  encoder_embedding_dim: 512
  # attention
  attention_rnn_dim: 1024
  attention_dim: 128
  # attention location
  attention_location_n_filters: 32
  attention_location_kernel_size: 31
  # decoder
  n_frames_per_step: 1
  decoder_rnn_dim: 1024
  prenet_dim: 256
  max_decoder_steps: 1000
  gate_threshold: 0.5
  p_attention_dropout: 0.1
  p_decoder_dropout: 0.1
  # postnet
  postnet_embedding_dim: 512
  postnet_kernel_size: 5
  postnet_n_convolutions: 5
  decoder_no_early_stopping: false

guided_attention_scheduler: &id001 !new:speechbrain.nnet.schedulers.StepScheduler
  initial_value: 50.0
  half_life: 10.
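# Rough sketch (an assumption for orientation, not verbatim from SpeechBrain):
# the criterion below combines a spectrogram reconstruction term, a stop-gate
# term, and a guided-attention term, approximately
#   total = MSE(mel_out, mel_target) + MSE(mel_out_postnet, mel_target)
#         + gate_loss_weight * BCE(gate_out, gate_target)
#         + attention_weight(epoch) * guided_attention_loss(alignments)
# where attention_weight(epoch) follows the scheduler defined above and is
# zeroed once guided_attention_hard_stop is reached.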
criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
  gate_loss_weight: 1.0
  guided_attention_weight: 50.0
  guided_attention_sigma: 0.2
  guided_attention_scheduler: *id001
  guided_attention_hard_stop: 50

modules:
  model: *id002

opt_class: !name:torch.optim.Adam
  lr: 0.001
  weight_decay: 0.000006

# epoch object
epoch_counter: &id003 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 500

# train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
#   save_file: !ref <train_log>
train_logger: !new:speechbrain.utils.train_logger.WandBLogger
  initializer: !name:wandb.init
    # id: !ref <wandb_id>
    name: tacotron2-luganda
    entity: sulaiman-kagumire
    project: tts-luganda
    reinit: true
    # yaml_config: hparams/train.yaml
    resume: allow

# annealing function
lr_annealing: &id004 !new:speechbrain.nnet.schedulers.IntervalScheduler
  # infer: !name:speechbrain.lobes.models.Tacotron2.infer
  intervals:
    - steps: 6000
      lr: 0.0005
    - steps: 8000
      lr: 0.0003
    - steps: 10000
      lr: 0.0001

# checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: ./results/tacotron2/1234/save
  recoverables:
    model: *id002
    counter: *id003
    scheduler: *id004

progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
  output_path: ./results/tacotron2/1234/samples
  batch_sample_size: 3
  formats:
    raw_batch: raw
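# Usage note (illustrative, assuming the standard SpeechBrain/HyperPyYAML
# workflow; the script name is an assumption): any value in this file can be
# overridden from the command line when launching training, e.g.
#   python train.py hparams/train.yaml --data_folder=/path/to/dataset --batch_size=64
# The IntervalScheduler above lowers the learning rate from 0.001 to 0.0005,
# 0.0003, and 0.0001 once training passes 6000, 8000, and 10000 optimizer
# steps respectively, and the checkpointer restores the model, epoch counter,
# and scheduler state when a run is resumed from ./results/tacotron2/1234/save.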