---
# network architecture
# encoder related
encoder: conformer
encoder_conf:
  output_size: 512  # dimension of attention
  attention_heads: 8
  linear_units: 2048  # the number of units of position-wise feed forward
  num_blocks: 12  # the number of encoder blocks
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: conv2d  # encoder input type, you can choose conv2d, conv2d6 and conv2d8
  normalize_before: true
  cnn_module_kernel: 31
  use_cnn_module: true
  activation_type: 'swish'
  pos_enc_layer_type: 'rel_pos'
  selfattention_layer_type: 'rel_selfattn'

# joint network: combines encoder and predictor outputs for the transducer loss
joint_conf:
  join_dim: 512
  prejoin_linear: true
  postjoin_linear: false
  joint_mode: 'add'
  activation: 'tanh'

# label predictor (transducer prediction network)
predictor: rnn
predictor_conf:
  embed_size: 512
  output_size: 512
  embed_dropout: 0.1
  hidden_size: 512
  num_layers: 2
  bias: true
  rnn_type: 'lstm'
  dropout: 0.1

# attention decoder (bidirectional: num_blocks left-to-right, r_num_blocks right-to-left)
decoder: bitransformer
decoder_conf:
  attention_heads: 8
  dropout_rate: 0.1
  linear_units: 2048
  num_blocks: 3
  positional_dropout_rate: 0.1
  r_num_blocks: 3
  self_attention_dropout_rate: 0.1
  src_attention_dropout_rate: 0.1

# hybrid transducer+ctc+attention
# NOTE: transducer_weight + ctc_weight + attention_weight = 1.0
model_conf:
  transducer_weight: 0.75
  ctc_weight: 0.1
  attention_weight: 0.15
  lsm_weight: 0.1  # label smoothing option
  length_normalized_loss: false
  reverse_weight: 0.3

# data pipeline: filtering, resampling, augmentation, shuffling, batching
dataset_conf:
  filter_conf:
    max_length: 1650
    min_length: 10
    token_max_length: 200
    token_min_length: 1
  resample_conf:
    resample_rate: 16000
  speed_perturb: true
  fbank_conf:
    num_mel_bins: 80
    frame_shift: 10
    frame_length: 25
    dither: 0.1
  spec_aug: true
  spec_aug_conf:
    num_t_mask: 2
    num_f_mask: 2
    max_t: 50
    max_f: 10
  shuffle: true
  shuffle_conf:
    shuffle_size: 1500
  sort: true
  sort_conf:
    sort_size: 500  # sort_size should be less than shuffle_size
  batch_conf:
    batch_type: 'dynamic'  # static or dynamic
    max_frames_in_batch: 4000

# training hyperparameters
grad_clip: 4
accum_grad: 1
max_epoch: 140
log_interval: 100

optim: adam
optim_conf:
  lr: 0.001
scheduler: warmuplr  # pytorch v1.1.0+ required
scheduler_conf:
  warmup_steps: 25000