---
# network architecture
# encoder related
encoder: conformer
encoder_conf:
  output_size: 512  # dimension of attention
  attention_heads: 8
  linear_units: 2048  # the number of units of position-wise feed forward
  num_blocks: 12  # the number of encoder blocks
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: conv2d  # encoder input type, you can choose conv2d, conv2d6 and conv2d8
  normalize_before: true
  cnn_module_kernel: 31
  use_cnn_module: true
  activation_type: 'swish'
  pos_enc_layer_type: 'rel_pos'
  selfattention_layer_type: 'rel_selfattn'

# joint network: combines encoder and predictor outputs for the transducer loss
joint_conf:
  join_dim: 512
  prejoin_linear: true
  postjoin_linear: false
  joint_mode: 'add'
  activation: 'tanh'

# label predictor (transducer prediction network)
predictor: rnn
predictor_conf:
  embed_size: 512
  output_size: 512
  embed_dropout: 0.1
  hidden_size: 512
  num_layers: 2
  bias: true
  rnn_type: 'lstm'
  dropout: 0.1

# attention decoder (bidirectional: num_blocks left-to-right, r_num_blocks right-to-left)
decoder: bitransformer
decoder_conf:
  attention_heads: 8
  dropout_rate: 0.1
  linear_units: 2048
  num_blocks: 3
  positional_dropout_rate: 0.1
  r_num_blocks: 3
  self_attention_dropout_rate: 0.1
  src_attention_dropout_rate: 0.1

# hybrid transducer+ctc+attention
# NOTE: transducer_weight + ctc_weight + attention_weight = 1.0
model_conf:
  transducer_weight: 0.75
  ctc_weight: 0.1
  attention_weight: 0.15
  lsm_weight: 0.1  # label smoothing option
  length_normalized_loss: false
  reverse_weight: 0.3

# data pipeline: filtering, resampling, augmentation, shuffling, batching
dataset_conf:
  filter_conf:
    max_length: 1650
    min_length: 10
    token_max_length: 200
    token_min_length: 1
  resample_conf:
    resample_rate: 16000
  speed_perturb: true
  fbank_conf:
    num_mel_bins: 80
    frame_shift: 10
    frame_length: 25
    dither: 0.1
  spec_aug: true
  spec_aug_conf:
    num_t_mask: 2
    num_f_mask: 2
    max_t: 50
    max_f: 10
  shuffle: true
  shuffle_conf:
    shuffle_size: 1500
  sort: true
  sort_conf:
    sort_size: 500  # sort_size should be less than shuffle_size
  batch_conf:
    batch_type: 'dynamic'  # static or dynamic
    max_frames_in_batch: 4000

# training hyperparameters
grad_clip: 4
accum_grad: 1
max_epoch: 140
log_interval: 100

optim: adam
optim_conf:
  lr: 0.001
scheduler: warmuplr  # pytorch v1.1.0+ required
scheduler_conf:
  warmup_steps: 25000