# Generated 2022-10-03 from:
# /netscratch/sagar/thesis/speechbrain/recipes/CommonVoice_de/ASR-Libri/seq2seq/hparams/train.yaml
# yamllint disable
# ############################################################################
# Model: E2E attention-based ASR
# Encoder: CRDNN model
# Decoder: GRU + beamsearch + RNNLM
# Tokens: unigram subword units
# Losses: CTC + NLL
# Training: CommonVoice, language "de" (header inherited from the
#           LibriSpeech 960h recipe; result folders still carry "960h")
# Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga,
# Samuele Cornell 2020
# ############################################################################

# Seed needs to be set at top of yaml, before objects with parameters
seed: 1200
__set_seed: !apply:torch.manual_seed [1200]

# Per-seed experiment outputs; everything lives under output_folder.
output_folder: results/CRDNN_BPE_960h_LM/1200
wer_file: results/CRDNN_BPE_960h_LM/1200/wer.txt
save_folder: results/CRDNN_BPE_960h_LM/1200/save
train_log: results/CRDNN_BPE_960h_LM/1200/train_log.txt

# Language model (LM) pretraining
# NB: To avoid mismatch, the speech recognizer must be trained with the same
# tokenizer used for LM training. The upstream recipe downloads both from the
# speechbrain HuggingFace repository; this run instead points at local
# checkpoints through pretrained_lm_path and pretrained_tokenizer_path.
# bos and eos share index 0 in this file, so the tokenizer and LM must also
# use bos id = eos id = 0.
pretrained_tokenizer_path: ../../Tokenizer/results/unigram/
pretrained_lm_path: ../../LM/results/RNN/2995/save/CKPT+2022-08-18+18-22-18+00

# Data files
data_folder: ../../CommonVoice  # CommonVoice root (holds the .tsv files below)
# NOTE: the noise/RIR dataset is downloaded automatically (see data_folder_rirs)
train_tsv_file: ../../CommonVoice/train.tsv  # Standard CommonVoice .tsv files
dev_tsv_file: ../../CommonVoice/dev.tsv  # Standard CommonVoice .tsv files
test_tsv_file: ../../CommonVoice/test.tsv  # Standard CommonVoice .tsv files
accented_letters: true
language: de
ckpt_interval_minutes: 15  # save checkpoint every N min
csv_dir: ../../cv_de_acc
data_folder_rirs: ../../cv_de_acc  # where to store noisy data for augment (change it if needed)
train_csv: ../../cv_de_acc/train.csv
valid_csv: ../../cv_de_acc/dev.csv
test_csv: ../../cv_de_acc/test.csv
skip_prep: false

# Training parameters
number_of_epochs: 25
number_of_ctc_epochs: 5
batch_size: 8
valid_batch_size: 8
test_batch_size: 8
lr: 1.0
ctc_weight: 0.5
sorting: ascending
dynamic_batching: false

# dynamic batching parameters, if used
dynamic_batch_sampler:
  feats_hop_size: 0.01
  max_batch_len: 20000  # in terms of frames
  shuffle_ex: true
  batch_ordering: random
  num_buckets: 20

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 40

# Optimizer factory; the nested keys are its keyword arguments.
opt_class: !name:torch.optim.Adadelta
  lr: 1.0
  rho: 0.95
  eps: 1.e-8

# Dataloader options
train_dataloader_opts:
  batch_size: 8

valid_dataloader_opts:
  batch_size: 8

test_dataloader_opts:
  batch_size: 8

# Model parameters
# The &id001 / &id002 anchors are aliased later by the encoder definition.
activation: &id001 !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (128, 256)
inter_layer_pooling_size: (2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: &id002 !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 4
rnn_neurons: 1024
rnn_bidirectional: true
dnn_blocks: 2
dnn_neurons: 512
emb_size: 128
dec_neurons: 1024
output_neurons: 1000  # Number of tokens (same as LM)
blank_index: 0
bos_index: 0
eos_index: 0

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 80
test_beam_size: 80
eos_threshold: 1.5
using_max_attn_shift: true
max_attn_shift: 240
lm_weight: 0.50
ctc_weight_decode: 0.0
coverage_penalty: 1.5
temperature: 1.25
temperature_lm: 1.25

# Object definitions. Keys nested under a `!new:` / `!name:` tag are the
# keyword arguments of that object; `&idNNN` anchors are aliased further down.
epoch_counter: &id013 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 25

normalize: &id008 !new:speechbrain.processing.features.InputNormalization
  norm_type: global

compute_features: !new:speechbrain.lobes.features.Fbank
  sample_rate: 16000
  n_fft: 400
  n_mels: 40

# Additive-noise corruption only: babble and reverb probabilities are 0.
env_corrupt: &id009 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: ../../cv_de_acc
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15

augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [95, 100, 105]

enc: &id003 !new:speechbrain.lobes.models.CRDNN.CRDNN
  input_shape: [null, null, 40]
  activation: *id001
  dropout: 0.15
  cnn_blocks: 2
  cnn_channels: (128, 256)
  cnn_kernelsize: (3, 3)
  inter_layer_pooling_size: (2, 2)
  time_pooling: true
  using_2d_pooling: false
  time_pooling_size: 4
  rnn_class: *id002
  rnn_layers: 4
  rnn_neurons: 1024
  rnn_bidirectional: true
  rnn_re_init: true
  dnn_blocks: 2
  dnn_neurons: 512
  use_rnnp: false

emb: &id004 !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: 1000
  embedding_dim: 128

dec: &id005 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: 512
  input_size: 128
  rnn_type: gru
  attn_type: location
  hidden_size: 1024
  attn_dim: 1024
  num_layers: 1
  scaling: 1.0
  channels: 10
  kernel_size: 100
  re_init: true
  dropout: 0.15

ctc_lin: &id006 !new:speechbrain.nnet.linear.Linear
  input_size: 512
  n_neurons: 1000

seq_lin: &id007 !new:speechbrain.nnet.linear.Linear
  input_size: 1024
  n_neurons: 1000

log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: 0

seq_cost: !name:speechbrain.nnet.losses.nll_loss
  label_smoothing: 0.1

# This is the RNNLM that is used according to the Huggingface repository
# NB: It has to
# match the pre-trained RNNLM!!
lm_model: &id010 !new:speechbrain.lobes.models.RNNLM.RNNLM
  output_neurons: 1000
  embedding_dim: 128
  activation: !name:torch.nn.LeakyReLU
  dropout: 0.0
  rnn_layers: 2
  rnn_neurons: 2048
  dnn_blocks: 1
  dnn_neurons: 512
  return_hidden: true  # For inference

tokenizer: &id014 !new:sentencepiece.SentencePieceProcessor

# Models
modules:
  enc: *id003
  emb: *id004
  dec: *id005
  ctc_lin: *id006
  seq_lin: *id007
  normalize: *id008
  env_corrupt: *id009
  lm_model: *id010

# Single positional argument: the list of sub-modules (encoder, embedding,
# decoder, CTC head, seq2seq head). The LM is not part of this list.
model: &id011 !new:torch.nn.ModuleList
  - [*id003, *id004, *id005, *id006, *id007]

# Validation beam search: no language model argument.
valid_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: *id004
  decoder: *id005
  linear: *id007
  ctc_linear: *id006
  bos_index: 0
  eos_index: 0
  blank_index: 0
  min_decode_ratio: 0.0
  max_decode_ratio: 1.0
  beam_size: 80
  eos_threshold: 1.5
  using_max_attn_shift: true
  max_attn_shift: 240
  coverage_penalty: 1.5
  temperature: 1.25

# Test beam search: same settings plus the RNNLM (lm_weight 0.50).
test_search: !new:speechbrain.decoders.S2SRNNBeamSearchLM
  embedding: *id004
  decoder: *id005
  linear: *id007
  ctc_linear: *id006
  language_model: *id010
  bos_index: 0
  eos_index: 0
  blank_index: 0
  min_decode_ratio: 0.0
  max_decode_ratio: 1.0
  beam_size: 80
  eos_threshold: 1.5
  using_max_attn_shift: true
  max_attn_shift: 240
  coverage_penalty: 1.5
  lm_weight: 0.50
  ctc_weight: 0.0
  temperature: 1.25
  temperature_lm: 1.25

lr_annealing: &id012 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 1.0
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 0

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/CRDNN_BPE_960h_LM/1200/save
  recoverables:
    model: *id011
    scheduler: *id012
    normalizer: *id008
    counter: *id013

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/CRDNN_BPE_960h_LM/1200/train_log.txt

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true

# The pretrainer allows a mapping between pretrained files and
# instances that are declared in the yaml. Here, the file given under
# `paths.lm` is loaded into the "lm" loadable, which points to the lm_model
# object defined before; `paths.tokenizer` is likewise loaded into the
# tokenizer object. Both paths in this run are local files.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  collect_in: results/CRDNN_BPE_960h_LM/1200/save
  loadables:
    lm: *id010
    tokenizer: *id014
  paths:
    lm: ../../LM/results/RNN/2995/save/CKPT+2022-08-18+18-22-18+00/model.ckpt
    tokenizer: ../../Tokenizer/results/unigram//1000_unigram.model