# Seed needs to be set at the top of the YAML, before any objects with
# parameters are instantiated
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
lang_csv: Swahili
output_folder: !ref results/finetune_hubert_ASR_char/<lang_csv>/<seed>
output_wer_folder: !ref <output_folder>/
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Pretrained HuBERT model (HuggingFace hub ID) and local checkpoint folder
hubert_hub: Orange/SSA-HuBERT-base-60k
hubert_folder: !ref <save_folder>/hubert_checkpoint

# Data files
data_folder: !ref PATH_TO_YOUR_FOLDER/data_speechbrain/<lang_csv>
ckpt_interval_minutes: 10 # save checkpoint every N min
train_csv: !ref <data_folder>/train.csv
valid_csv: !ref <data_folder>/validation.csv
test_csv:
    - !ref <data_folder>/test.csv

####################### Training Parameters ####################################
number_of_epochs: 10
lr: 0.1
lr_hubert: 0.000005
sorting: ascending
precision: fp32 # bf16, fp16 or fp32
sample_rate: 16000

# Skip audio files longer than this (in seconds)
avoid_if_longer_than: 60

batch_size: 2
test_batch_size: 2

# Dataloader options
train_dataloader_opts:
    batch_size: !ref <batch_size>

valid_dataloader_opts:
    batch_size: !ref <batch_size>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>

####################### Model Parameters #######################################
activation: !name:torch.nn.LeakyReLU
dnn_layers: 2
dnn_neurons: 1024
freeze_hubert: False

# Outputs
output_neurons: 66 # BPE size, index(blank/eos/bos) = 0
blank_index: 0

#
# Functions and classes
#
label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
    source: !ref <hubert_hub>
    output_norm: True
    freeze: !ref <freeze_hubert>
    save_path: !ref <hubert_folder>

top_lin: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
    input_shape: [null, null, 768] # 768 == output dim of the HuBERT base model
    activation: !ref <activation>
    dnn_blocks: !ref <dnn_layers>
    dnn_neurons: !ref <dnn_neurons>

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>

modules:
    hubert: !ref <hubert>
    top_lin: !ref <top_lin>
    ctc_lin: !ref <ctc_lin>

model: !new:torch.nn.ModuleList
    - [!ref <top_lin>, !ref <ctc_lin>]

model_opt_class: !name:torch.optim.Adadelta
    lr: !ref <lr>
    rho: 0.95
    eps: 1.e-8

hubert_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_hubert>

lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0

lr_annealing_hubert: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_hubert>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

############################## Augmentations ###################################

# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: [95, 100, 105]

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05
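# NB on units (per the SpeechBrain time-domain docstrings; worth re-checking
# against the installed version): the DropFreq bounds above are normalized
# frequencies in [0, 1], where 1 corresponds to the Nyquist frequency, while
# the DropChunk lengths below are in samples, so 1000-2000 samples is roughly
# 62-125 ms at the 16 kHz sample rate used here.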
# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5

# Augmenter: combines the augmentations defined above to perform data
# augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    concat_original: True
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

############################## Decoding ########################################

# Decoding parameters
test_beam_search:
    beam_size: 143
    topk: 1
    blank_index: !ref <blank_index>
    space_token: ' ' # make sure this matches the token used by the tokenizer
    beam_prune_logp: -12.0
    token_prune_min_logp: -1.20
    prune_history: True
    alpha: 0.8
    beta: 1.2
    # The LM can be downloaded from https://www.openslr.org/11/ or trained
    # with KenLM. It can be either a .bin or a .arpa file; note that .arpa is
    # much slower to load. If you don't want to use an LM, set this to null.
    kenlm_model_path: null

############################## Logging and Pretrainer ##########################

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        hubert: !ref <hubert>
        model: !ref <model>
        scheduler_model: !ref <lr_annealing_model>
        scheduler_hubert: !ref <lr_annealing_hubert>
        counter: !ref <epoch_counter>
        tokenizer: !ref <label_encoder>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True
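# ------------------------------------------------------------------------------
# Minimal loading sketch: how a recipe's train script typically turns this file
# into live objects. The file name "hparams.yaml" and the override shown are
# illustrative placeholders, not part of this recipe; load_hyperpyyaml is the
# standard HyperPyYAML entry point used by SpeechBrain.
#
#     from hyperpyyaml import load_hyperpyyaml
#
#     with open("hparams.yaml") as f:
#         # !new: objects are instantiated here; !ref <key> values are resolved
#         hparams = load_hyperpyyaml(f, overrides={"seed": 1234})
#
#     hubert = hparams["hubert"]              # instantiated HuBERT wrapper
#     checkpointer = hparams["checkpointer"]  # saves/recovers the recoverables
# ------------------------------------------------------------------------------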