# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: true
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
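# transducer joint network: encoder and predictor outputs are (roughly) projected
# to join_dim by the pre-join linear layers, combined according to joint_mode and
# passed through the activation before the final output projection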
joint_conf:
    join_dim: 512
    prejoin_linear: true
    postjoin_linear: false
    joint_mode: 'add'
    activation: 'tanh'
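# transducer prediction network: embeds previously emitted tokens and runs them
# through a 2-layer LSTM (see rnn_type / num_layers below)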
predictor: rnn
predictor_conf:
    embed_size: 256
    output_size: 256
    embed_dropout: 0.1
    hidden_size: 256
    num_layers: 2
    bias: true
    rnn_type: 'lstm'
    dropout: 0.1
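# attention decoder: bitransformer stacks num_blocks left-to-right and
# r_num_blocks right-to-left decoder layers; the right-to-left branch is
# weighted by reverse_weight in model_conf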
decoder: bitransformer
decoder_conf:
    attention_heads: 4
    dropout_rate: 0.1
    linear_units: 2048
    num_blocks: 3
    positional_dropout_rate: 0.1
    r_num_blocks: 3
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
# hybrid transducer+ctc+attention
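# the three weights below balance the transducer, CTC and attention losses
# (they sum to 1.0 in this config)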
model_conf:
    transducer_weight: 0.75
    ctc_weight: 0.1
    attention_weight: 0.15
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
    reverse_weight: 0.3
dataset_conf:
    filter_conf:
        max_length: 1650
        min_length: 10
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10   # ms
        frame_length: 25  # ms
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500    # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'dynamic' # static or dynamic
        max_frames_in_batch: 4000
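# 'dynamic' batching packs utterances into a batch until roughly
# max_frames_in_batch feature frames are reached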
grad_clip: 4
accum_grad: 1
max_epoch: 140
log_interval: 100
optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
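# warmuplr: the learning rate ramps up linearly for warmup_steps steps and then
# decays with the inverse square root of the step count (Noam-style)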