dathudeptrai committed on
Commit 6482e82 • 1 Parent(s): cd8a09b

😘 Update config file.

Files changed (1)
  1. config.yml +85 -58
config.yml CHANGED
@@ -1,59 +1,86 @@
- allow_cache: true
- batch_size: 32
- config: egs/ljspeech/conf/tacotron2.v1.yaml
- dev_dir: ./egs/ljspeech/dump/valid/
- end_ratio_value: 0.0
- eval_interval_steps: 500
- format: npy
- hop_size: 256
- is_shuffle: true
- log_interval_steps: 100
- mel_length_threshold: 32
- mixed_precision: false
- num_save_intermediate_results: 1
- optimizer_params:
-   decay_steps: 150000
-   end_learning_rate: 1.0e-05
-   initial_learning_rate: 0.001
-   warmup_proportion: 0.02
-   weight_decay: 0.001
- outdir: ./egs/ljspeech/exp/tacotron2.v1
- remove_short_samples: true
- resume: ./egs/ljspeech/exp/tacotron2.v1/checkpoints/ckpt-45000
- save_interval_steps: 5000
- schedule_decay_steps: 50000
- start_ratio_value: 0.5
- start_schedule_teacher_forcing: 250000
  tacotron2_params:
-   attention_dim: 128
-   attention_filters: 32
-   attention_kernel: 31
-   attention_type: lsa
-   decoder_lstm_units: 1024
-   embedding_dropout_prob: 0.1
-   embedding_hidden_size: 512
-   encoder_conv_activation: relu
-   encoder_conv_dropout_rate: 0.5
-   encoder_conv_filters: 512
-   encoder_conv_kernel_sizes: 5
-   encoder_lstm_units: 256
-   initializer_range: 0.02
-   n_conv_encoder: 5
-   n_conv_postnet: 5
-   n_lstm_decoder: 1
-   n_mels: 80
-   n_prenet_layers: 2
-   n_speakers: 1
-   postnet_conv_filters: 512
-   postnet_conv_kernel_sizes: 5
-   postnet_dropout_rate: 0.1
-   prenet_activation: relu
-   prenet_dropout_rate: 0.5
-   prenet_units: 256
-   reduction_factor: 1
- train_dir: ./egs/ljspeech/dump/train/
- train_max_steps: 200000
- use_fixed_shapes: true
- use_norm: true
- verbose: 1
- version: 0.3.4
+ # This is the hyperparameter configuration file for Tacotron2 v1.
+ # Please make sure this is adjusted for the LJSpeech dataset. If you want to
+ # apply it to another dataset, you might need to carefully change some parameters.
+ # This configuration runs for 200k iters, but 65k iters is enough to get a good model.
+
+ ###########################################################
+ #                FEATURE EXTRACTION SETTING               #
+ ###########################################################
+ hop_size: 256 # Hop size.
+ format: "npy"
+
+
+ ###########################################################
+ #              NETWORK ARCHITECTURE SETTING               #
+ ###########################################################
+ model_type: "tacotron2"
+
  tacotron2_params:
+   dataset: ljspeech
+   embedding_hidden_size: 512
+   initializer_range: 0.02
+   embedding_dropout_prob: 0.1
+   n_speakers: 1
+   n_conv_encoder: 5
+   encoder_conv_filters: 512
+   encoder_conv_kernel_sizes: 5
+   encoder_conv_activation: 'relu'
+   encoder_conv_dropout_rate: 0.5
+   encoder_lstm_units: 256
+   n_prenet_layers: 2
+   prenet_units: 256
+   prenet_activation: 'relu'
+   prenet_dropout_rate: 0.5
+   n_lstm_decoder: 1
+   reduction_factor: 1
+   decoder_lstm_units: 1024
+   attention_dim: 128
+   attention_filters: 32
+   attention_kernel: 31
+   n_mels: 80
+   n_conv_postnet: 5
+   postnet_conv_filters: 512
+   postnet_conv_kernel_sizes: 5
+   postnet_dropout_rate: 0.1
+   attention_type: "lsa"
+
+ ###########################################################
+ #                   DATA LOADER SETTING                   #
+ ###########################################################
+ batch_size: 32 # Batch size for each GPU, assuming gradient_accumulation_steps == 1.
+ remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
+ allow_cache: true # Whether to cache the dataset. If true, it requires CPU memory.
+ mel_length_threshold: 32 # Remove all targets whose mel_length <= 32.
+ is_shuffle: true # Shuffle the dataset after each epoch.
+ use_fixed_shapes: true # Use fixed shapes for training (2x speed-up);
+                        # see https://github.com/dathudeptrai/TensorflowTTS/issues/34#issuecomment-642309118
+
+ ###########################################################
+ #             OPTIMIZER & SCHEDULER SETTING               #
+ ###########################################################
+ optimizer_params:
+   initial_learning_rate: 0.001
+   end_learning_rate: 0.00001
+   decay_steps: 150000 # < train_max_steps is recommended.
+   warmup_proportion: 0.02
+   weight_decay: 0.001
+
+ gradient_accumulation_steps: 1
+ var_train_expr: null # Trainable variable expr (e.g. 'embeddings|decoder_cell');
+                      # entries must be separated by |. If var_train_expr is null,
+                      # all variables are trained.
+ ###########################################################
+ #                     INTERVAL SETTING                    #
+ ###########################################################
+ train_max_steps: 200000 # Number of training steps.
+ save_interval_steps: 2000 # Interval steps to save a checkpoint.
+ eval_interval_steps: 500 # Interval steps to evaluate the network.
+ log_interval_steps: 200 # Interval steps to record the training log.
+ start_schedule_teacher_forcing: 200001 # Set above train_max_steps, so scheduled teacher forcing is never applied.
+ start_ratio_value: 0.5 # Start ratio of scheduled teacher forcing.
+ schedule_decay_steps: 50000 # Decay steps of scheduled teacher forcing.
+ end_ratio_value: 0.0 # End ratio of scheduled teacher forcing.
+ ###########################################################
+ #                      OTHER SETTING                      #
+ ###########################################################
+ num_save_intermediate_results: 1 # Number of results to be saved as intermediate results.
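
For context outside the diff: the updated config.yml is plain YAML, so it can be loaded and inspected independently of the TensorFlowTTS training scripts. The snippet below is a minimal sketch assuming only PyYAML and a local copy of the file; the key names come from the added lines above, but the loading code itself is not part of this commit.

```python
# Minimal sketch: load config.yml and read a few of the values changed in this commit.
# Assumes PyYAML is installed and config.yml is in the current directory.
import yaml

with open("config.yml") as f:
    config = yaml.safe_load(f)

# Top-level training settings.
print(config["model_type"])       # "tacotron2"
print(config["batch_size"])       # 32
print(config["train_max_steps"])  # 200000

# Nested model hyperparameters.
taco = config["tacotron2_params"]
print(taco["n_mels"], taco["reduction_factor"])  # 80 1

# Nested optimizer settings.
opt = config["optimizer_params"]
print(opt["initial_learning_rate"], opt["decay_steps"])  # 0.001 150000
```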
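
The optimizer_params block describes a warmup phase followed by a decay from initial_learning_rate down to end_learning_rate over decay_steps. The sketch below is one plausible reading of those numbers using stock tf.keras schedules, assuming warmup_proportion is measured against train_max_steps and the decay is polynomial (linear by default); it is an illustration under those assumptions, not the exact optimizer code in the TensorFlowTTS trainer (which also applies weight_decay).

```python
import tensorflow as tf

# Values taken from optimizer_params and train_max_steps in this commit.
initial_learning_rate = 0.001
end_learning_rate = 0.00001
decay_steps = 150000
warmup_proportion = 0.02
train_max_steps = 200000

# Assumed interpretation: warmup_proportion is applied to train_max_steps.
warmup_steps = int(warmup_proportion * train_max_steps)  # 4000

# Polynomial (linear by default) decay from 1e-3 to 1e-5 over 150k steps.
decay = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=decay_steps,
    end_learning_rate=end_learning_rate,
)


class WarmupThenDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup to the initial learning rate, then the decay above (sketch only)."""

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_lr = initial_learning_rate * (step + 1.0) / float(warmup_steps)
        return tf.where(step < warmup_steps, warmup_lr, decay(step - warmup_steps))


optimizer = tf.keras.optimizers.Adam(learning_rate=WarmupThenDecay())
```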